diff --git a/.clang-format b/.clang-format index 3cd3f6da33..6cba3dca4b 100644 --- a/.clang-format +++ b/.clang-format @@ -25,6 +25,7 @@ AlwaysBreakTemplateDeclarations: Yes AttributeMacros: [ '_CCCL_ALIGNAS_TYPE', '_CCCL_ALIGNAS', + '_CCCL_ALWAYS_INLINE', '_CCCL_CONSTEXPR_CXX14', '_CCCL_CONSTEXPR_CXX17', '_CCCL_CONSTEXPR_CXX20', @@ -32,46 +33,27 @@ AttributeMacros: [ '_CCCL_DEVICE', '_CCCL_FALLTHROUGH', '_CCCL_FORCEINLINE', + '_CCCL_HIDE_FROM_ABI', '_CCCL_HOST_DEVICE', '_CCCL_HOST', '_CCCL_NO_UNIQUE_ADDRESS', '_CCCL_NODISCARD_FRIEND', '_CCCL_NODISCARD', '_CCCL_NORETURN', + '_CCCL_TYPE_VISIBILITY_DEFAULT', '_CCCL_VISIBILITY_HIDDEN', 'CUB_RUNTIME_FUNCTION', 'CUB_DETAIL_KERNEL_ATTRIBUTES', 'THRUST_RUNTIME_FUNCTION', 'THRUST_DETAIL_KERNEL_ATTRIBUTES', '_LIBCUDACXX_ALIGNOF', - '_LIBCUDACXX_ALWAYS_INLINE', '_LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS', - '_LIBCUDACXX_CONSTINIT', '_LIBCUDACXX_DEPRECATED_IN_CXX11', '_LIBCUDACXX_DEPRECATED_IN_CXX14', '_LIBCUDACXX_DEPRECATED_IN_CXX17', '_LIBCUDACXX_DEPRECATED_IN_CXX20', '_LIBCUDACXX_DEPRECATED', - '_LIBCUDACXX_DISABLE_EXTENTSION_WARNING', - '_LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION', - '_LIBCUDACXX_EXPORTED_FROM_ABI', - '_LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS', - '_LIBCUDACXX_HIDDEN', - '_LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1', '_LIBCUDACXX_HIDE_FROM_ABI', - '_LIBCUDACXX_INLINE_VISIBILITY', - '_LIBCUDACXX_INTERNAL_LINKAGE', - '_LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS', - '_LIBCUDACXX_NO_DESTROY', - '_LIBCUDACXX_NO_SANITIZE', - '_LIBCUDACXX_NOALIAS', - '_LIBCUDACXX_OVERRIDABLE_FUNC_VIS', - '_LIBCUDACXX_STANDALONE_DEBUG', - '_LIBCUDACXX_TEMPLATE_DATA_VIS', - '_LIBCUDACXX_TEMPLATE_VIS', - '_LIBCUDACXX_THREAD_SAFETY_ANNOTATION', - '_LIBCUDACXX_USING_IF_EXISTS', - '_LIBCUDACXX_WEAK', ] BinPackArguments: false BinPackParameters: false @@ -108,6 +90,9 @@ IfMacros: [ IndentWrappedFunctionNames: false IncludeBlocks: Regroup IncludeCategories: + - Regex: '^' + Priority: 0x7FFFFFFF + SortPriority: 0x7FFFFFFF - Regex: '^<(cuda/std/detail/__config|cub/config.cuh|thrust/detail/config.h|thrust/system/cuda/config.h)' Priority: 0 SortPriority: 0 diff --git a/.devcontainer/cuda12.5-gcc10/devcontainer.json b/.devcontainer/cuda12.5-gcc10/devcontainer.json deleted file mode 100644 index 61459a25fc..0000000000 --- a/.devcontainer/cuda12.5-gcc10/devcontainer.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.5", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc10", - "CCCL_CUDA_VERSION": "12.5", - "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.5-gcc10" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - 
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.5-gcc10" -} diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.6-gcc10/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-gcc12/devcontainer.json rename to .devcontainer/cuda12.6-gcc10/devcontainer.json index 1d16b6aa61..1d49b0ebc5 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc10-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc12", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc10", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.5-gcc12" + "CCCL_HOST_COMPILER_VERSION": "10", + "CCCL_BUILD_INFIX": "cuda12.6-gcc10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc12" + "name": "cuda12.6-gcc10" } diff --git a/.devcontainer/cuda12.5-gcc11/devcontainer.json b/.devcontainer/cuda12.6-gcc11/devcontainer.json similarity index 90% rename from .devcontainer/cuda12.5-gcc11/devcontainer.json rename to .devcontainer/cuda12.6-gcc11/devcontainer.json index 184de8734c..ba1e0ea891 100644 --- a/.devcontainer/cuda12.5-gcc11/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc11", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc11", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.5-gcc11" + "CCCL_BUILD_INFIX": "cuda12.6-gcc11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 
+49,5 @@ } } }, - "name": "cuda12.5-gcc11" + "name": "cuda12.6-gcc11" } diff --git a/.devcontainer/cuda12.6-gcc12/devcontainer.json b/.devcontainer/cuda12.6-gcc12/devcontainer.json new file mode 100644 index 0000000000..d25796f6cc --- /dev/null +++ b/.devcontainer/cuda12.6-gcc12/devcontainer.json @@ -0,0 +1,53 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.6", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6-gcc12", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "gcc", + "CCCL_HOST_COMPILER_VERSION": "12", + "CCCL_BUILD_INFIX": "cuda12.6-gcc12" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6-gcc12" +} diff --git a/.devcontainer/cuda12.6-gcc13/devcontainer.json b/.devcontainer/cuda12.6-gcc13/devcontainer.json new file mode 100644 index 0000000000..666f0e6621 --- /dev/null +++ b/.devcontainer/cuda12.6-gcc13/devcontainer.json @@ -0,0 +1,53 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.6", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6-gcc13", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "gcc", + "CCCL_HOST_COMPILER_VERSION": "13", + "CCCL_BUILD_INFIX": "cuda12.6-gcc13" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + 
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6-gcc13" +} diff --git a/.devcontainer/cuda12.5-gcc9/devcontainer.json b/.devcontainer/cuda12.6-gcc7/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-gcc9/devcontainer.json rename to .devcontainer/cuda12.6-gcc7/devcontainer.json index 333c11b3cc..0ca9492cd3 100644 --- a/.devcontainer/cuda12.5-gcc9/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc9", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc7", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.5-gcc9" + "CCCL_HOST_COMPILER_VERSION": "7", + "CCCL_BUILD_INFIX": "cuda12.6-gcc7" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc9" + "name": "cuda12.6-gcc7" } diff --git a/.devcontainer/cuda12.5-gcc8/devcontainer.json b/.devcontainer/cuda12.6-gcc8/devcontainer.json similarity index 90% rename from .devcontainer/cuda12.5-gcc8/devcontainer.json rename to .devcontainer/cuda12.6-gcc8/devcontainer.json index 10b44d31f1..8e3aacd04d 100644 --- a/.devcontainer/cuda12.5-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc8-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc8", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc8", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", - "CCCL_BUILD_INFIX": "cuda12.5-gcc8" + "CCCL_BUILD_INFIX": "cuda12.6-gcc8" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc8" + "name": "cuda12.6-gcc8" } diff --git a/.devcontainer/cuda12.5-gcc7/devcontainer.json b/.devcontainer/cuda12.6-gcc9/devcontainer.json similarity index 88% rename from .devcontainer/cuda12.5-gcc7/devcontainer.json rename to .devcontainer/cuda12.6-gcc9/devcontainer.json index 9d5d356ad5..4c30069c48 100644 --- a/.devcontainer/cuda12.5-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.6-gcc9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc7-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc9-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc7", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc9", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "7", - "CCCL_BUILD_INFIX": "cuda12.5-gcc7" + "CCCL_HOST_COMPILER_VERSION": "9", + "CCCL_BUILD_INFIX": "cuda12.6-gcc9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc7" + "name": "cuda12.6-gcc9" } diff --git a/.devcontainer/cuda12.5-llvm12/devcontainer.json b/.devcontainer/cuda12.6-llvm10/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm12/devcontainer.json rename to .devcontainer/cuda12.6-llvm10/devcontainer.json index e1cbc4ecb7..59b03b60d4 100644 --- a/.devcontainer/cuda12.5-llvm12/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm10/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm12", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm10", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.5-llvm12" + "CCCL_HOST_COMPILER_VERSION": "10", + "CCCL_BUILD_INFIX": "cuda12.6-llvm10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm12" + "name": "cuda12.6-llvm10" } diff --git a/.devcontainer/cuda12.5-llvm13/devcontainer.json b/.devcontainer/cuda12.6-llvm11/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm13/devcontainer.json rename to .devcontainer/cuda12.6-llvm11/devcontainer.json index 6fbbf56b79..8907106550 100644 --- a/.devcontainer/cuda12.5-llvm13/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.5", + "image": 
"rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm13", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm11", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.5-llvm13" + "CCCL_HOST_COMPILER_VERSION": "11", + "CCCL_BUILD_INFIX": "cuda12.6-llvm11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm13" + "name": "cuda12.6-llvm11" } diff --git a/.devcontainer/cuda12.5-llvm14/devcontainer.json b/.devcontainer/cuda12.6-llvm12/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm14/devcontainer.json rename to .devcontainer/cuda12.6-llvm12/devcontainer.json index b8528e989f..522fd7fb80 100644 --- a/.devcontainer/cuda12.5-llvm14/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm12-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm14", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm12", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "14", - "CCCL_BUILD_INFIX": "cuda12.5-llvm14" + "CCCL_HOST_COMPILER_VERSION": "12", + "CCCL_BUILD_INFIX": "cuda12.6-llvm12" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm14" + "name": "cuda12.6-llvm12" } diff --git a/.devcontainer/cuda12.5-llvm15/devcontainer.json b/.devcontainer/cuda12.6-llvm13/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm15/devcontainer.json rename to .devcontainer/cuda12.6-llvm13/devcontainer.json index 768d3163ee..bc9f36b98b 100644 --- a/.devcontainer/cuda12.5-llvm15/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm15-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm13-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm15", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm13", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "15", - "CCCL_BUILD_INFIX": "cuda12.5-llvm15" + "CCCL_HOST_COMPILER_VERSION": "13", + "CCCL_BUILD_INFIX": "cuda12.6-llvm13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm15" + "name": "cuda12.6-llvm13" } diff --git a/.devcontainer/cuda12.5-llvm16/devcontainer.json b/.devcontainer/cuda12.6-llvm14/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm16/devcontainer.json rename to .devcontainer/cuda12.6-llvm14/devcontainer.json index 8ba700fa4e..bf43444cb5 100644 --- a/.devcontainer/cuda12.5-llvm16/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm14/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm16-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm14-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm16", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm14", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "16", - "CCCL_BUILD_INFIX": "cuda12.5-llvm16" + "CCCL_HOST_COMPILER_VERSION": "14", + "CCCL_BUILD_INFIX": "cuda12.6-llvm14" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm16" + "name": "cuda12.6-llvm14" } diff --git a/.devcontainer/cuda12.5-llvm17/devcontainer.json b/.devcontainer/cuda12.6-llvm15/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-llvm17/devcontainer.json rename to .devcontainer/cuda12.6-llvm15/devcontainer.json index 0de5689fdc..a6228c43a0 100644 --- a/.devcontainer/cuda12.5-llvm17/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm15/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm17-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm15-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm17", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm15", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "17", - "CCCL_BUILD_INFIX": "cuda12.5-llvm17" + "CCCL_HOST_COMPILER_VERSION": "15", + "CCCL_BUILD_INFIX": "cuda12.6-llvm15" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm17" + "name": "cuda12.6-llvm15" } diff --git a/.devcontainer/cuda12.5-llvm11/devcontainer.json b/.devcontainer/cuda12.6-llvm16/devcontainer.json similarity index 88% rename from .devcontainer/cuda12.5-llvm11/devcontainer.json rename to .devcontainer/cuda12.6-llvm16/devcontainer.json index a216720e5d..e0c8fd3212 100644 --- a/.devcontainer/cuda12.5-llvm11/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm16/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm11-cuda12.5", + "image": 
"rapidsai/devcontainers:24.10-cpp-llvm16-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm11", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm16", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.5-llvm11" + "CCCL_HOST_COMPILER_VERSION": "16", + "CCCL_BUILD_INFIX": "cuda12.6-llvm16" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm11" + "name": "cuda12.6-llvm16" } diff --git a/.devcontainer/cuda12.6-llvm17/devcontainer.json b/.devcontainer/cuda12.6-llvm17/devcontainer.json new file mode 100644 index 0000000000..1920aa035d --- /dev/null +++ b/.devcontainer/cuda12.6-llvm17/devcontainer.json @@ -0,0 +1,53 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:24.10-cpp-llvm17-cuda12.6", + "hostRequirements": { + "gpu": "optional" + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", + "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.6-llvm17", + "CCCL_CUDA_VERSION": "12.6", + "CCCL_HOST_COMPILER": "llvm", + "CCCL_HOST_COMPILER_VERSION": "17", + "CCCL_BUILD_INFIX": "cuda12.6-llvm17" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=cccl-build,target=/home/coder/cccl/build" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd", + "xaver.clang-format", + "nvidia.nsight-vscode-edition", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.defaultFormatter": "xaver.clang-format", + "editor.formatOnSave": true, + "clang-format.executable": "/usr/bin/clang-format", + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}" + ], + "files.eol": "\n", + "files.trimTrailingWhitespace": true + } + } + }, + "name": "cuda12.6-llvm17" +} diff --git a/.devcontainer/cuda12.5-llvm10/devcontainer.json b/.devcontainer/cuda12.6-llvm18/devcontainer.json similarity index 88% rename from .devcontainer/cuda12.5-llvm10/devcontainer.json rename to .devcontainer/cuda12.6-llvm18/devcontainer.json index 8e3e19d4fc..80e92119f7 100644 --- a/.devcontainer/cuda12.5-llvm10/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm18/devcontainer.json @@ -1,6 
+1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm10-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm18-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm10", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm18", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.5-llvm10" + "CCCL_HOST_COMPILER_VERSION": "18", + "CCCL_BUILD_INFIX": "cuda12.6-llvm18" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm10" + "name": "cuda12.6-llvm18" } diff --git a/.devcontainer/cuda12.5-llvm9/devcontainer.json b/.devcontainer/cuda12.6-llvm9/devcontainer.json similarity index 90% rename from .devcontainer/cuda12.5-llvm9/devcontainer.json rename to .devcontainer/cuda12.6-llvm9/devcontainer.json index d34ae01844..6ef30d1657 100644 --- a/.devcontainer/cuda12.5-llvm9/devcontainer.json +++ b/.devcontainer/cuda12.6-llvm9/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-llvm9-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-llvm9", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-llvm9", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.5-llvm9" + "CCCL_BUILD_INFIX": "cuda12.6-llvm9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-llvm9" + "name": "cuda12.6-llvm9" } diff --git a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json rename to .devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json index a530527cac..a7c7e3cdff 100644 --- a/.devcontainer/cuda12.5-oneapi2023.2.0/devcontainer.json +++ b/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-oneapi2023.2.0-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-oneapi2023.2.0-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-oneapi2023.2.0", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-oneapi2023.2.0", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "oneapi", "CCCL_HOST_COMPILER_VERSION": "2023.2.0", - "CCCL_BUILD_INFIX": "cuda12.5-oneapi2023.2.0" + "CCCL_BUILD_INFIX": "cuda12.6-oneapi2023.2.0" }, 
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-oneapi2023.2.0" + "name": "cuda12.6-oneapi2023.2.0" } diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 0f3fbb36f5..666f0e6621 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.6", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.5-gcc13", - "CCCL_CUDA_VERSION": "12.5", + "DEVCONTAINER_NAME": "cuda12.6-gcc13", + "CCCL_CUDA_VERSION": "12.6", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.5-gcc13" + "CCCL_BUILD_INFIX": "cuda12.6-gcc13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", @@ -49,5 +49,5 @@ } } }, - "name": "cuda12.5-gcc13" + "name": "cuda12.6-gcc13" } diff --git a/.github/actions/docs-build/action.yml b/.github/actions/docs-build/action.yml index 8b997f4741..db7f323174 100644 --- a/.github/actions/docs-build/action.yml +++ b/.github/actions/docs-build/action.yml @@ -38,6 +38,8 @@ runs: cp -rf ./docs/_build/docs/cudax/latest/* _site/cudax mkdir _site/cuda_cooperative cp -rf ./docs/_build/docs/cuda_cooperative/latest/* _site/cuda_cooperative + mkdir _site/cuda_parallel + cp -rf ./docs/_build/docs/cuda_parallel/latest/* _site/cuda_parallel ./docs/scrape_docs.bash ./_site # Update docs as workflow artifact: diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py index a3b216e3fd..cd2aad01d9 100755 --- a/.github/actions/workflow-build/build-workflow.py +++ b/.github/actions/workflow-build/build-workflow.py @@ -580,6 +580,14 @@ def remove_dispatch_job_from_container(job, container): return False +def index_of_dispatch_job_in_container(job, container): + "Find the index of a dispatch job in a container, using compare_dispatch_jobs." + for idx, job2 in enumerate(container): + if compare_dispatch_jobs(job, job2): + return idx + return None + + def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig): workflow_dispatch_groups = copy.deepcopy(workflow_dispatch_groups_orig) @@ -614,7 +622,7 @@ def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig): producer = producers[0] if dispatch_job_in_container(producer, merged_producers): - producer_index = merged_producers.index(producers) + producer_index = index_of_dispatch_job_in_container(producer, merged_producers) matching_consumers = merged_consumers[producer_index] producer_name = producer['name'] diff --git a/CMakeLists.txt b/CMakeLists.txt index 198727dc5d..bddb6f1a73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,7 @@ option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ${CCCL_TOPLEVEL_P option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." ${CCCL_TOPLEVEL_PROJECT}) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." 
${CCCL_TOPLEVEL_PROJECT}) option(CCCL_ENABLE_BENCHMARKS "Enable CUDA C++ Core Library benchmarks." OFF) +option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF) option(CCCL_ENABLE_UNSTABLE "Enable targets and developer build options for unstable projects." OFF) @@ -45,6 +46,11 @@ if (CCCL_ENABLE_UNSTABLE) option(CCCL_ENABLE_CUDAX "Enable the CUDA Experimental developer build." ON) endif() +option(CCCL_DISABLE_EXCEPTIONS "Disable use of exceptions within CCCL libraries." OFF) +if (CCCL_DISABLE_EXCEPTIONS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCCCL_DISABLE_EXCEPTIONS") +endif() + include(CTest) enable_testing() @@ -77,6 +83,10 @@ if (CCCL_ENABLE_UNSTABLE) add_subdirectory(cudax) endif() +if (CCCL_ENABLE_C) + add_subdirectory(c) +endif() + if (CCCL_ENABLE_TESTING) add_subdirectory(test) endif() diff --git a/CMakePresets.json b/CMakePresets.json index 10bdd83539..ecc9b22761 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -38,6 +38,7 @@ "CCCL_ENABLE_TESTING": true, "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, + "CCCL_ENABLE_C": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b28e0b9ea2..f2088f4338 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,6 +3,9 @@ Thank you for your interest in contributing to the CUDA C++ Core Libraries (CCCL)! +Looking for ideas for your first contribution? Check out: ![GitHub Issues or Pull Requests by label](https://img.shields.io/github/issues/nvidia/cccl/good%20first%20issue) + + ## Getting Started 1. **Fork & Clone the Repository**: diff --git a/benchmarks/scripts/cccl/bench/bench.py b/benchmarks/scripts/cccl/bench/bench.py index f93f2eff57..049dcbb601 100644 --- a/benchmarks/scripts/cccl/bench/bench.py +++ b/benchmarks/scripts/cccl/bench/bench.py @@ -19,7 +19,7 @@ def first_val(my_dict): first_value = values[0] if not all(value == first_value for value in values): - raise ValueError('All values in the dictionary are not equal') + raise ValueError('All values in the dictionary are not equal. 
First value: {} All values: {}'.format(first_value, values)) return first_value @@ -648,11 +648,11 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True): p.wait(timeout=timeout) elapsed = time.time() - begin - logger.info("finished benchmark {} with {} ({}) in {}s".format(self.label(), ct_point, p.returncode, elapsed)) + logger.info("finished benchmark {} with {} ({}) in {:.3f}s".format(self.label(), ct_point, p.returncode, elapsed)) return BenchResult(result_path, p.returncode, elapsed) except subprocess.TimeoutExpired: - logger.info("benchmark {} with {} reached timeout of {}s".format(self.label(), ct_point, timeout)) + logger.info("benchmark {} with {} reached timeout of {:.3f}s".format(self.label(), ct_point, timeout)) os.killpg(os.getpgid(p.pid), signal.SIGTERM) return BenchResult(None, 42, float('inf')) diff --git a/benchmarks/scripts/cccl/bench/cmake.py b/benchmarks/scripts/cccl/bench/cmake.py index 095531a005..4340c999c6 100644 --- a/benchmarks/scripts/cccl/bench/cmake.py +++ b/benchmarks/scripts/cccl/bench/cmake.py @@ -80,7 +80,7 @@ def do_build(self, bench, timeout): stderr=subprocess.DEVNULL) p.wait(timeout=timeout) elapsed = time.time() - begin - logger.info("finished build for {} ({}) in {}s".format(bench.label(), p.returncode, elapsed)) + logger.info("finished build for {} (exit code: {}) in {:.3f}s".format(bench.label(), p.returncode, elapsed)) return Build(p.returncode, elapsed) except subprocess.TimeoutExpired: diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt new file mode 100644 index 0000000000..3e3783903b --- /dev/null +++ b/c/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.30) + +project(cccl.c LANGUAGES CUDA CXX) + +add_library(cccl.c SHARED src/reduce.cu) +set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20) +set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20) + +find_package(CUDAToolkit REQUIRED) + +# TODO Use static versions of cudart, nvrtc, and nvJitLink +target_link_libraries(cccl.c PRIVATE CUDA::cudart + CUDA::nvrtc + CUDA::nvJitLink + CUDA::cuda_driver) +target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1) +target_include_directories(cccl.c PUBLIC "include") + +add_subdirectory(test) diff --git a/c/include/cccl/reduce.h b/c/include/cccl/reduce.h new file mode 100644 index 0000000000..5047625a85 --- /dev/null +++ b/c/include/cccl/reduce.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning." +#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv + +# include <cuda.h> + +# include <cccl/types.h> + +struct cccl_device_reduce_build_result_t +{ + int cc; + void* cubin; + size_t cubin_size; + CUlibrary library; + CUkernel single_tile_kernel; + CUkernel single_tile_second_kernel; + CUkernel reduction_kernel; +}; + +// TODO return a union of nvtx/cuda/nvrtc errors or a string?
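+// Usage sketch (assumed workflow, mirroring the standard CUB two-phase pattern used in reduce.cu below): call cccl_device_reduce_build once per operator/type combination; call cccl_device_reduce with d_temp_storage == nullptr to query temp_storage_bytes; allocate that many device bytes; call cccl_device_reduce again to run the reduction; finally release the JIT-compiled kernels with cccl_device_reduce_cleanup.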
+extern "C" CCCL_C_API CUresult cccl_device_reduce_build( + cccl_device_reduce_build_result_t* build, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept; + +extern "C" CCCL_C_API CUresult cccl_device_reduce( + cccl_device_reduce_build_result_t build, + void* d_temp_storage, + size_t* temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + CUstream stream) noexcept; + +extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr); + +#endif // CCCL_C_EXPERIMENTAL diff --git a/c/include/cccl/types.h b/c/include/cccl/types.h new file mode 100644 index 0000000000..781b9f9ea6 --- /dev/null +++ b/c/include/cccl/types.h @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef CCCL_C_EXPERIMENTAL +# warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning." +#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv + +# if defined(_WIN32) +# define CCCL_C_API __declspec(dllexport) +# else +# define CCCL_C_API __attribute__((visibility("default"))) +# endif + +enum class cccl_type_enum +{ + INT8 = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + UINT8 = 4, + UINT16 = 5, + UINT32 = 6, + UINT64 = 7, + FLOAT32 = 8, + FLOAT64 = 9, + STORAGE = 10 +}; + +struct cccl_type_info +{ + int size; + int alignment; + cccl_type_enum type; +}; + +enum class cccl_op_kind_t +{ + stateless = 0, + stateful = 1 +}; + +struct cccl_op_t +{ + cccl_op_kind_t type; + const char* name; + const char* ltoir; + int ltoir_size; + int size; + int alignment; + void* state; +}; + +enum class cccl_iterator_kind_t +{ + pointer = 0, + iterator = 1 +}; + +struct cccl_value_t +{ + cccl_type_info type; + void* state; +}; + +struct cccl_iterator_t +{ + int size; + int alignment; + cccl_iterator_kind_t type; + cccl_op_t advance; + cccl_op_t dereference; + cccl_type_info value_type; + void* state; +}; + +#endif // CCCL_C_EXPERIMENTAL diff --git a/c/src/reduce.cu b/c/src/reduce.cu new file mode 100644 index 0000000000..4badcd1ff0 --- /dev/null +++ b/c/src/reduce.cu @@ -0,0 +1,864 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include <cub/detail/choose_offset.cuh> +#include <cub/device/dispatch/dispatch_reduce.cuh> +#include <cub/grid/grid_even_share.cuh> + +#include <cuda/std/cstdint> +#include <cuda/std/functional> + +#include <format> +#include <iostream> +#include <memory> + +#include <cccl/reduce.h> +#include <nvJitLink.h> +#include <nvrtc.h> + +void check(nvrtcResult result) +{ + if (result != NVRTC_SUCCESS) + { + throw std::runtime_error(std::string("NVRTC error: ") + nvrtcGetErrorString(result)); + } +} + +void check(CUresult result) +{ + if (result != CUDA_SUCCESS) + { + const char* str = nullptr; + cuGetErrorString(result, &str); + throw std::runtime_error(std::string("CUDA error: ") + str); + } +} + +void check(nvJitLinkResult result) +{ + if (result != NVJITLINK_SUCCESS) + { + throw std::runtime_error(std::string("nvJitLink error: ") + std::to_string(result)); + } +} + +struct op_wrapper; +struct device_reduce_policy; +using TransformOpT = ::cuda::std::__identity; +using OffsetT = unsigned long long; +static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "OffsetT must be size_t"); + +struct nothing_t +{}; + +struct runtime_tuning_policy +{ + int block_size; + int items_per_thread; + int vector_load_length; +}; + +struct storage_t; +struct input_iterator_state_t; +struct output_iterator_t; + +char const* cccl_type_enum_to_string(cccl_type_enum type) +{ + switch (type) + { + case cccl_type_enum::INT8: + return "::cuda::std::int8_t"; + case cccl_type_enum::INT16: + return "::cuda::std::int16_t"; + case cccl_type_enum::INT32: + return "::cuda::std::int32_t"; + case cccl_type_enum::INT64: + return "::cuda::std::int64_t"; + case cccl_type_enum::UINT8: + return "::cuda::std::uint8_t"; + case cccl_type_enum::UINT16: + return "::cuda::std::uint16_t"; + case cccl_type_enum::UINT32: + return "::cuda::std::uint32_t"; + case cccl_type_enum::UINT64: + return "::cuda::std::uint64_t"; + case cccl_type_enum::FLOAT32: + return "float"; + case cccl_type_enum::FLOAT64: + return "double"; + case cccl_type_enum::STORAGE: + return "storage_t"; + } + return "unknown"; +} + +std::string cccl_type_enum_to_name(cccl_type_enum type, bool is_pointer = false) +{ + std::string result; + + if (is_pointer) + { + switch (type) + { + case cccl_type_enum::INT8: + + check(nvrtcGetTypeName<::cuda::std::int8_t*>(&result)); + break; + case cccl_type_enum::INT16: + check(nvrtcGetTypeName<::cuda::std::int16_t*>(&result)); + break; + case cccl_type_enum::INT32: + check(nvrtcGetTypeName<::cuda::std::int32_t*>(&result)); + break; + case cccl_type_enum::INT64: + check(nvrtcGetTypeName<::cuda::std::int64_t*>(&result)); + break; + case cccl_type_enum::UINT8: + check(nvrtcGetTypeName<::cuda::std::uint8_t*>(&result)); + break; + case cccl_type_enum::UINT16: + check(nvrtcGetTypeName<::cuda::std::uint16_t*>(&result)); + break; + case cccl_type_enum::UINT32: + check(nvrtcGetTypeName<::cuda::std::uint32_t*>(&result)); + break; + case cccl_type_enum::UINT64: + check(nvrtcGetTypeName<::cuda::std::uint64_t*>(&result)); + break; + case cccl_type_enum::FLOAT32: + check(nvrtcGetTypeName<float*>(&result)); + break; + case cccl_type_enum::FLOAT64: + check(nvrtcGetTypeName<double*>(&result)); + break; + case cccl_type_enum::STORAGE: + check(nvrtcGetTypeName<storage_t*>(&result)); + break; + } + } + else + { + switch (type) + { + case cccl_type_enum::INT8: + check(nvrtcGetTypeName<::cuda::std::int8_t>(&result)); + break; + case cccl_type_enum::INT16: + check(nvrtcGetTypeName<::cuda::std::int16_t>(&result)); + break; + case cccl_type_enum::INT32: + check(nvrtcGetTypeName<::cuda::std::int32_t>(&result)); + break; + case cccl_type_enum::INT64: + check(nvrtcGetTypeName<::cuda::std::int64_t>(&result)); + break; + case
cccl_type_enum::UINT8: + check(nvrtcGetTypeName<::cuda::std::uint8_t>(&result)); + break; + case cccl_type_enum::UINT16: + check(nvrtcGetTypeName<::cuda::std::uint16_t>(&result)); + break; + case cccl_type_enum::UINT32: + check(nvrtcGetTypeName<::cuda::std::uint32_t>(&result)); + break; + case cccl_type_enum::UINT64: + check(nvrtcGetTypeName<::cuda::std::uint64_t>(&result)); + break; + case cccl_type_enum::FLOAT32: + check(nvrtcGetTypeName<float>(&result)); + break; + case cccl_type_enum::FLOAT64: + check(nvrtcGetTypeName<double>(&result)); + break; + case cccl_type_enum::STORAGE: + check(nvrtcGetTypeName<storage_t>(&result)); + break; + } + } + + return result; +} + +struct reduce_tuning_t +{ + int cc; + int block_size; + int items_per_thread; + int vector_load_length; +}; + +template <int N> +reduce_tuning_t find_tuning(int cc, const reduce_tuning_t (&tunings)[N]) +{ + for (const reduce_tuning_t& tuning : tunings) + { + if (cc >= tuning.cc) + { + return tuning; + } + } + + return tunings[N - 1]; +} + +runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info input_type) +{ + reduce_tuning_t chain[] = {{60, 256, 16, 4}, {35, 256, 20, 4}}; + + auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain); + + // Implement part of MemBoundScaling + items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2)); + block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32); + + return {block_size, items_per_thread, vector_load_length}; +} + +cccl_type_info get_accumulator_type(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init) +{ + // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet + // so switching back to the old accumulator type logic for now + return init.type; +} + +cudaError_t InvokeSingleTile( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + int cc, + CUkernel single_tile_kernel, + CUstream stream) +{ + const runtime_tuning_policy policy = get_policy(cc, d_in.value_type, d_in.value_type); + + cudaError error = cudaSuccess; + do + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + break; + } + + nothing_t nothing{}; + TransformOpT transform_op{}; + void* op_state = op.type == cccl_op_kind_t::stateless ? &nothing : op.state; + void* in_ptr = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state; + void* out_ptr = d_out.type == cccl_iterator_kind_t::pointer ?
&d_out.state : d_out.state; + void* args[] = {in_ptr, out_ptr, &num_items, op_state, init.state, &transform_op}; + + check(cuLaunchKernel((CUfunction) single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, args, 0)); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } + } while (0); + + return error; +} + +cudaError_t InvokePasses( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + unsigned long long num_items, + cccl_op_t op, + cccl_value_t init, + int cc, + CUkernel reduce_kernel, + CUkernel single_tile_kernel, + CUdevice device, + CUstream stream) +{ + const cccl_type_info accum_t = get_accumulator_type(op, d_in, init); + const runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type); + + cudaError error = cudaSuccess; + do + { + void* in_ptr = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state; + void* out_ptr = d_out.type == cccl_iterator_kind_t::pointer ? &d_out.state : d_out.state; + + // Get SM count + int sm_count; + check(cuDeviceGetAttribute(&sm_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); + + // Init regular kernel configuration + const auto tile_size = policy.block_size * policy.items_per_thread; + + // Older drivers have issues handling CUkernel in the occupancy queries, get the CUfunction instead. + // Assumes that the current device is properly set, it needs to be set for the occupancy queries anyway + CUfunction reduce_kernel_fn; + check(cuKernelGetFunction(&reduce_kernel_fn, reduce_kernel)); + + int sm_occupancy = 1; + check(cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, reduce_kernel_fn, policy.block_size, 0)); + + int reduce_device_occupancy = sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); + cub::GridEvenShare<OffsetT> even_share; + even_share.DispatchInit(num_items, max_blocks, tile_size); + + // Temporary storage allocation requirements + void* allocations[1] = {}; + size_t allocation_sizes[1] = { + max_blocks * static_cast<std::size_t>(d_in.value_type.size) // bytes needed for privatized block reductions + }; + + // Alias the temporary allocations from the single storage blob (or + // compute the necessary size of the blob) + error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + break; + } + + if (d_temp_storage == nullptr) + { + // Return if the caller is simply requesting the size of the storage + // allocation + return cudaSuccess; + } + + // Get grid size for device_reduce_sweep_kernel + OffsetT reduce_grid_size = even_share.grid_size; + + // Invoke DeviceReduceKernel + // reduce_kernel<<<reduce_grid_size, policy.block_size, 0, stream>>>( + // d_in, d_block_reductions, num_items, even_share, ReductionOpT{}, TransformOpT{}); + + nothing_t nothing{}; + void* op_state = op.type == cccl_op_kind_t::stateless ?
&nothing : op.state; + + TransformOpT transform_op{}; + void* reduce_args[] = {in_ptr, &allocations[0], &num_items, &even_share, op_state, &transform_op}; + + check(cuLaunchKernel( + (CUfunction) reduce_kernel, reduce_grid_size, 1, 1, policy.block_size, 1, 1, 0, stream, reduce_args, 0)); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } + + // single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS>>>( + // d_block_reductions, d_out, reduce_grid_size, ReductionOpT{}, 0, TransformOpT{}); + + void* single_tile_kernel_args[] = {&allocations[0], out_ptr, &reduce_grid_size, op_state, init.state, &transform_op}; + + check(cuLaunchKernel( + (CUfunction) single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, single_tile_kernel_args, 0)); + + // Check for failure to launch + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } + } while (0); + + return error; +} + +cudaError_t Invoke( + void* d_temp_storage, + std::size_t& temp_storage_bytes, + cccl_iterator_t d_in, + cccl_iterator_t d_out, + OffsetT num_items, + cccl_op_t op, + cccl_value_t init, + int cc, + CUkernel single_tile_kernel, + CUkernel single_tile_second_kernel, + CUkernel reduce_kernel, + CUdevice device, + CUstream stream) +{ + const cccl_type_info accum_t = get_accumulator_type(op, d_in, init); + runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type); + + // Force kernel code-generation in all compiler passes + if (num_items <= (policy.block_size * policy.items_per_thread)) + { + // Small, single tile size + return InvokeSingleTile( + d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, op, init, cc, single_tile_kernel, stream); + } + else + { + // Multi-tile pass + return InvokePasses( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + op, + init, + cc, + reduce_kernel, + single_tile_second_kernel, + device, + stream); + } +} + +std::string get_input_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName<input_iterator_state_t>(&iterator_t)); + return iterator_t; +} + +std::string get_output_iterator_name() +{ + std::string iterator_t; + check(nvrtcGetTypeName<output_iterator_t>(&iterator_t)); + return iterator_t; +} + +std::string get_single_tile_kernel_name( + cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init, bool is_second_kernel) +{ + std::string chained_policy_t; + check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t)); + + const cccl_type_info accum_t = get_accumulator_type(op, input_it, init); + const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type); + const std::string input_iterator_t = + is_second_kernel ? cccl_type_enum_to_name(accum_t.type, true) + : input_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(input_it.value_type.type, true) // + : get_input_iterator_name(); + const std::string output_iterator_t = + output_it.type == cccl_iterator_kind_t::pointer // + ?
cccl_type_enum_to_name(output_it.value_type.type, true) // + : get_output_iterator_name(); + const std::string init_t = cccl_type_enum_to_name(init.type.type); + + std::string offset_t; + check(nvrtcGetTypeName<OffsetT>(&offset_t)); + + std::string reduction_op_t; + check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t)); + + return std::format( + "cub::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>", + chained_policy_t, + input_iterator_t, + output_iterator_t, + offset_t, + reduction_op_t, + init_t, + accum_cpp_t); +} + +std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init) +{ + std::string chained_policy_t; + check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t)); + + const std::string input_iterator_t = + input_it.type == cccl_iterator_kind_t::pointer // + ? cccl_type_enum_to_name(input_it.value_type.type, true) // + : get_input_iterator_name(); + + const std::string accum_t = cccl_type_enum_to_name(get_accumulator_type(op, input_it, init).type); + + std::string offset_t; + check(nvrtcGetTypeName<OffsetT>(&offset_t)); + + std::string reduction_op_t; + check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t)); + + std::string transform_op_t; + check(nvrtcGetTypeName<TransformOpT>(&transform_op_t)); + + return std::format( + "cub::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>", + chained_policy_t, + input_iterator_t, + offset_t, + reduction_op_t, + accum_t, + transform_op_t); +} + +bool try_push_context() +{ + CUcontext context = nullptr; + + check(cuCtxGetCurrent(&context)); + + if (context == nullptr) + { + const int default_device = 0; + check(cuDevicePrimaryCtxRetain(&context, default_device)); + check(cuCtxPushCurrent(context)); + + return true; + } + + return false; +} + +extern "C" CCCL_C_API CUresult cccl_device_reduce_build( + cccl_device_reduce_build_result_t* build, + cccl_iterator_t input_it, + cccl_iterator_t output_it, + cccl_op_t op, + cccl_value_t init, + int cc_major, + int cc_minor, + const char* cub_path, + const char* thrust_path, + const char* libcudacxx_path, + const char* ctk_path) noexcept +{ + CUresult error = CUDA_SUCCESS; + + try + { + nvrtcProgram prog{}; + const char* name = "test"; + + const int cc = cc_major * 10 + cc_minor; + const cccl_type_info accum_t = get_accumulator_type(op, input_it, init); + const std::string accum_cpp = cccl_type_enum_to_string(accum_t.type); + const runtime_tuning_policy policy = get_policy(cc, accum_t, input_it.value_type); + const std::string input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type); + const std::string offset_t = cccl_type_enum_to_string(cccl_type_enum::UINT64); + + const std::string input_iterator_src = + input_it.type == cccl_iterator_kind_t::pointer + ?
std::string{} + : std::format( + "extern \"C\" __device__ {3} {4}(const void *self_ptr);\n" + "extern \"C\" __device__ void {5}(void *self_ptr, {0} offset);\n" + "struct __align__({2}) input_iterator_state_t {{\n;" + " using iterator_category = cuda::std::random_access_iterator_tag;\n" + " using value_type = {3};\n" + " using difference_type = {0};\n" + " using pointer = {3}*;\n" + " using reference = {3}&;\n" + " __device__ value_type operator*() const {{ return {4}(this); }}\n" + " __device__ input_iterator_state_t& operator+=(difference_type diff) {{\n" + " {5}(this, diff);\n" + " return *this;\n" + " }}\n" + " __device__ value_type operator[](difference_type diff) const {{\n" + " return *(*this + diff);\n" + " }}\n" + " __device__ input_iterator_state_t operator+(difference_type diff) const {{\n" + " input_iterator_state_t result = *this;\n" + " result += diff;\n" + " return result;\n" + " }}\n" + " char data[{1}];\n" + "}};\n", + offset_t, // 0 + input_it.size, // 1 + input_it.alignment, // 2 + input_it_value_t, // 3 + input_it.dereference.name, // 4 + input_it.advance.name); // 5 + + const std::string output_iterator_src = + output_it.type == cccl_iterator_kind_t::pointer + ? std::string{} + : std::format( + "extern \"C\" __device__ void {2}(const void *self_ptr, {1} x);\n" + "extern \"C\" __device__ void {3}(void *self_ptr, {0} offset);\n" + "struct __align__({5}) output_iterator_state_t{{\n" + " char data[{4}];\n" + "}};\n" + "struct output_iterator_proxy_t {{\n" + " __device__ output_iterator_proxy_t operator=({1} x) {{\n" + " {2}(&state, x);\n" + " return *this;\n" + " }}\n" + " output_iterator_state_t state;\n" + "}};\n" + "struct output_iterator_t {{\n" + " using iterator_category = cuda::std::random_access_iterator_tag;\n" + " using difference_type = {0};\n" + " using value_type = void;\n" + " using pointer = output_iterator_proxy_t*;\n" + " using reference = output_iterator_proxy_t;\n" + " __device__ output_iterator_proxy_t operator*() const {{ return {{state}}; }}\n" + " __device__ output_iterator_t& operator+=(difference_type diff) {{\n" + " {3}(&state, diff);\n" + " return *this;\n" + " }}\n" + " __device__ output_iterator_proxy_t operator[](difference_type diff) const {{\n" + " output_iterator_t result = *this;\n" + " result += diff;\n" + " return {{ result.state }};\n" + " }}\n" + " __device__ output_iterator_t operator+(difference_type diff) const {{\n" + " output_iterator_t result = *this;\n" + " result += diff;\n" + " return result;\n" + " }}\n" + " output_iterator_state_t state;\n" + "}};", + offset_t, // 0 + accum_cpp, // 1 + output_it.dereference.name, // 2 + output_it.advance.name, // 3 + output_it.size, // 4 + output_it.alignment); // 5 + + const std::string op_src = + op.type == cccl_op_kind_t::stateless + ? 
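+      // For a stateless user op named "op" over ints, the first branch below
+      // formats to (illustrative):
+      //   extern "C" __device__ int op(int lhs, int rhs);
+      //   struct op_wrapper {
+      //     __device__ int operator()(int lhs, int rhs) const { return op(lhs, rhs); }
+      //   };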
std::format(
+          "extern \"C\" __device__ {0} {1}({0} lhs, {0} rhs);\n"
+          "struct op_wrapper {{\n"
+          "  __device__ {0} operator()({0} lhs, {0} rhs) const {{\n"
+          "    return {1}(lhs, rhs);\n"
+          "  }}\n"
+          "}};\n",
+          accum_cpp,
+          op.name)
+      : std::format(
+          "struct __align__({2}) op_state {{\n"
+          "  char data[{3}];\n"
+          "}};"
+          "extern \"C\" __device__ {0} {1}(op_state *state, {0} lhs, {0} rhs);\n"
+          "struct op_wrapper {{\n"
+          "  op_state state;\n"
+          "  __device__ {0} operator()({0} lhs, {0} rhs) {{\n"
+          "    return {1}(&state, lhs, rhs);\n"
+          "  }}\n"
+          "}};\n",
+          accum_cpp,
+          op.name,
+          op.alignment,
+          op.size);
+
+    const std::string src = std::format(
+      "#include \n"
+      "#include \n"
+      "struct __align__({1}) storage_t {{\n"
+      "  char data[{0}];\n"
+      "}};\n"
+      "{4}\n"
+      "{5}\n"
+      "struct agent_policy_t {{\n"
+      "  static constexpr int ITEMS_PER_THREAD = {2};\n"
+      "  static constexpr int BLOCK_THREADS = {3};\n"
+      "  static constexpr int VECTOR_LOAD_LENGTH = {7};\n"
+      "  static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS;\n"
+      "  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_LDG;\n"
+      "}};\n"
+      "struct device_reduce_policy {{\n"
+      "  struct ActivePolicy {{\n"
+      "    using ReducePolicy = agent_policy_t;\n"
+      "    using SingleTilePolicy = agent_policy_t;\n"
+      "  }};\n"
+      "}};\n"
+      "{6};\n",
+      input_it.value_type.size, // 0
+      input_it.value_type.alignment, // 1
+      policy.items_per_thread, // 2
+      policy.block_size, // 3
+      input_iterator_src, // 4
+      output_iterator_src, // 5
+      op_src, // 6
+      policy.vector_load_length); // 7
+
+    check(nvrtcCreateProgram(&prog, src.c_str(), name, 0, nullptr, nullptr));
+
+    std::string single_tile_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, false);
+    check(nvrtcAddNameExpression(prog, single_tile_kernel_name.c_str()));
+
+    std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true);
+    check(nvrtcAddNameExpression(prog, single_tile_second_kernel_name.c_str()));
+
+    std::string reduction_kernel_name = get_device_reduce_kernel_name(op, input_it, init);
+    check(nvrtcAddNameExpression(prog, reduction_kernel_name.c_str()));
+
+    const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
+
+    constexpr int num_args = 7;
+    const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
+
+    std::size_t log_size{};
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, num_args, args);
+
+    check(nvrtcGetProgramLogSize(prog, &log_size));
+
+    std::unique_ptr<char[]> log{new char[log_size]};
+    check(nvrtcGetProgramLog(prog, log.get()));
+
+    if (log_size > 1)
+    {
+      std::cerr << log.get() << std::endl;
+    }
+
+    const char* single_tile_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_kernel_name.c_str(), &single_tile_kernel_lowered_name));
+
+    const char* single_tile_second_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_second_kernel_name.c_str(), &single_tile_second_kernel_lowered_name));
+
+    const char* reduction_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, reduction_kernel_name.c_str(), &reduction_kernel_lowered_name));
+
+    // Copy lowered names to a std::unique_ptr to ensure they can be used after
+    // the program is destroyed
+
+    std::unique_ptr<char[]> single_tile_kernel_lowered_name_ptr{new char[strlen(single_tile_kernel_lowered_name) + 1]};
+    strcpy(single_tile_kernel_lowered_name_ptr.get(), single_tile_kernel_lowered_name);
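+    // Note: compile_result is deliberately not checked until after the program
+    // log has been fetched and printed, so NVRTC diagnostics still reach
+    // stderr when compilation fails.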
+    std::unique_ptr<char[]> single_tile_second_kernel_lowered_name_ptr{
+      new char[strlen(single_tile_second_kernel_lowered_name) + 1]};
+    strcpy(single_tile_second_kernel_lowered_name_ptr.get(), single_tile_second_kernel_lowered_name);
+
+    std::unique_ptr<char[]> reduction_kernel_lowered_name_ptr{new char[strlen(reduction_kernel_lowered_name) + 1]};
+    strcpy(reduction_kernel_lowered_name_ptr.get(), reduction_kernel_lowered_name);
+
+    check(compile_result);
+
+    std::size_t ltoir_size{};
+    check(nvrtcGetLTOIRSize(prog, &ltoir_size));
+    std::unique_ptr<char[]> ltoir{new char[ltoir_size]};
+    check(nvrtcGetLTOIR(prog, ltoir.get()));
+    check(nvrtcDestroyProgram(&prog));
+
+    nvJitLinkHandle handle;
+    const char* lopts[] = {"-lto", arch.c_str()};
+    check(nvJitLinkCreate(&handle, 2, lopts));
+
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoir.get(), ltoir_size, name));
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, op.ltoir, op.ltoir_size, name));
+
+    if (input_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, input_it.advance.ltoir, input_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, input_it.dereference.ltoir, input_it.dereference.ltoir_size, name));
+    }
+
+    if (output_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(
+        nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, output_it.advance.ltoir, output_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, output_it.dereference.ltoir, output_it.dereference.ltoir_size, name));
+    }
+
+    check(nvJitLinkComplete(handle));
+
+    std::size_t cubin_size{};
+    check(nvJitLinkGetLinkedCubinSize(handle, &cubin_size));
+    std::unique_ptr<char[]> cubin{new char[cubin_size]};
+    check(nvJitLinkGetLinkedCubin(handle, cubin.get()));
+    check(nvJitLinkDestroy(&handle));
+
+    check(cuLibraryLoadData(&build->library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
+    check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(
+      &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name_ptr.get()));
+
+    build->cc = cc;
+    build->cubin = cubin.release();
+    build->cubin_size = cubin_size;
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce(
+  cccl_device_reduce_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept
+{
+  bool pushed = false;
+  CUresult error = CUDA_SUCCESS;
+  try
+  {
+    pushed = try_push_context();
+
+    CUdevice cu_device;
+    check(cuCtxGetDevice(&cu_device));
+
+    Invoke(
+      d_temp_storage,
+      *temp_storage_bytes,
+      d_in,
+      d_out,
+      num_items,
+      op,
+      init,
+      build.cc,
+      build.single_tile_kernel,
+      build.single_tile_second_kernel,
+      build.reduction_kernel,
+      cu_device,
+      stream);
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  if (pushed)
+  {
+    CUcontext dummy;
+    cuCtxPopCurrent(&dummy);
+  }
+
+  return error;
+}
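+// Typical call sequence (illustrative sketch; mirrors c/test/test_reduce.cpp):
+//   cccl_device_reduce_build_result_t build;
+//   cccl_device_reduce_build(&build, in, out, op, init, cc_major, cc_minor, ...);
+//   size_t tmp_bytes = 0;
+//   cccl_device_reduce(build, nullptr, &tmp_bytes, in, out, n, op, init, stream); // size query
+//   /* allocate tmp_bytes of device memory as tmp */
+//   cccl_device_reduce(build, tmp, &tmp_bytes, in, out, n, op, init, stream);     // reduction
+//   cccl_device_reduce_cleanup(&build);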
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr)
+{
+  try
+  {
+    if (bld_ptr == nullptr)
+    {
+      return CUDA_ERROR_INVALID_VALUE;
+    }
+
+    std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(bld_ptr->cubin));
+    check(cuLibraryUnload(bld_ptr->library));
+  }
+  catch (...)
+  {
+    return CUDA_ERROR_UNKNOWN;
+  }
+
+  return CUDA_SUCCESS;
+}
diff --git a/c/test/CMakeLists.txt b/c/test/CMakeLists.txt
new file mode 100644
index 0000000000..6a8599500e
--- /dev/null
+++ b/c/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(cccl.c.test.reduce test_reduce.cpp test_main.cpp)
+
+target_link_libraries(cccl.c.test.reduce PRIVATE cccl.c CUDA::cudart CUDA::nvrtc Catch2::Catch2)
+
+target_compile_definitions(cccl.c.test.reduce PRIVATE CCCL_C_EXPERIMENTAL
+                           TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub"
+                           TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust"
+                           TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include"
+                           TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}")
diff --git a/c/test/c2h.h b/c/test/c2h.h
new file mode 100644
index 0000000000..e2b26895a8
--- /dev/null
+++ b/c/test/c2h.h
@@ -0,0 +1,310 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+static std::string inspect_sass(const void* cubin, size_t cubin_size)
+{
+  namespace fs = std::filesystem;
+
+  fs::path temp_dir = fs::temp_directory_path();
+
+  fs::path temp_in_filename = temp_dir / "temp_in_file.cubin";
+  fs::path temp_out_filename = temp_dir / "temp_out_file.sass";
+
+  std::ofstream temp_in_file(temp_in_filename, std::ios::binary);
+  if (!temp_in_file)
+  {
+    throw std::runtime_error("Failed to create temporary file.");
+  }
+
+  temp_in_file.write(static_cast<const char*>(cubin), cubin_size);
+  temp_in_file.close();
+
+  std::string command = "nvdisasm -gi ";
+  command += temp_in_filename;
+  command += " > ";
+  command += temp_out_filename;
+
+  if (std::system(command.c_str()) != 0)
+  {
+    throw std::runtime_error("Failed to execute command.");
+  }
+
+  if (!fs::remove(temp_in_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  std::ifstream temp_out_file(temp_out_filename, std::ios::binary);
+  if (!temp_out_file)
+  {
+    throw std::runtime_error("Failed to open temporary file.");
+  }
+
+  const std::string sass{std::istreambuf_iterator<char>(temp_out_file), std::istreambuf_iterator<char>()};
+  if (!fs::remove(temp_out_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  return sass;
+}
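+// Usage sketch: test_reduce.cpp feeds the returned SASS to assertions that the
+// JIT-compiled kernels do not spill to local memory, e.g.:
+//   const std::string sass = inspect_sass(build.cubin, build.cubin_size);
+//   REQUIRE(sass.find("LDL") == std::string::npos); // no local loads
+//   REQUIRE(sass.find("STL") == std::string::npos); // no local stores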
+
+static std::string compile(const std::string& source)
+{
+  // compile source to LTO-IR using nvrtc
+
+  nvrtcProgram prog;
+  REQUIRE(NVRTC_SUCCESS == nvrtcCreateProgram(&prog, source.c_str(), "op.cu", 0, nullptr, nullptr));
+
+  const char* options[] = {"--std=c++17", "-rdc=true", "-dlto"};
+  REQUIRE(NVRTC_SUCCESS == nvrtcCompileProgram(prog, 3, options));
+
+  std::size_t ltoir_size{};
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIRSize(prog, &ltoir_size));
+
+  std::unique_ptr<char[]> ltoir(new char[ltoir_size]);
+
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIR(prog, ltoir.get()));
+  REQUIRE(NVRTC_SUCCESS == nvrtcDestroyProgram(&prog));
+
+  return std::string(ltoir.get(), ltoir_size);
+}
+
+template <class T>
+std::vector<T> generate(std::size_t num_items)
+{
+  std::random_device rnd_device;
+  std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
+  std::uniform_int_distribution<T> dist{T{1}, T{42}};
+  std::vector<T> vec(num_items);
+  std::generate(vec.begin(), vec.end(), [&]() {
+    return dist(mersenne_engine);
+  });
+  return vec;
+}
+
+template <class T>
+cccl_type_info get_type_info()
+{
+  cccl_type_info info;
+  info.size = sizeof(T);
+  info.alignment = alignof(T);
+
+  if constexpr (std::is_same_v<T, int8_t>)
+  {
+    info.type = cccl_type_enum::INT8;
+  }
+  else if constexpr (std::is_same_v<T, int32_t>)
+  {
+    info.type = cccl_type_enum::INT32;
+  }
+  else if constexpr (std::is_same_v<T, uint32_t>)
+  {
+    info.type = cccl_type_enum::UINT32;
+  }
+  else if constexpr (std::is_same_v<T, int64_t>)
+  {
+    info.type = cccl_type_enum::INT64;
+  }
+  else if constexpr (std::is_same_v<T, uint64_t>)
+  {
+    info.type = cccl_type_enum::UINT64;
+  }
+  else if constexpr (!std::is_integral_v<T>)
+  {
+    info.type = cccl_type_enum::STORAGE;
+  }
+  else
+  {
+    static_assert(sizeof(T) == 0, "Unsupported type");
+  }
+
+  return info;
+}
+
+static std::string get_op(cccl_type_enum t)
+{
+  switch (t)
+  {
+    case cccl_type_enum::INT8:
+      return "extern \"C\" __device__ char op(char a, char b) { return a + b; }";
+    case cccl_type_enum::INT32:
+      return "extern \"C\" __device__ int op(int a, int b) { return a + b; }";
+    case cccl_type_enum::UINT32:
+      return "extern \"C\" __device__ unsigned int op(unsigned int a, unsigned int b) { return a + b; }";
+    case cccl_type_enum::INT64:
+      return "extern \"C\" __device__ long long op(long long a, long long b) { return a + b; }";
+    case cccl_type_enum::UINT64:
+      return "extern \"C\" __device__ unsigned long long op(unsigned long long a, unsigned long long b) { "
+             "  return a + b; "
+             "}";
+    default:
+      throw std::runtime_error("Unsupported type");
+  }
+  return "";
+}
+
+template <class T>
+struct pointer_t
+{
+  T* ptr{};
+
+  pointer_t(int num_items)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, num_items * sizeof(T)));
+  }
+
+  pointer_t(const std::vector<T>& vec)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, vec.size() * sizeof(T)));
+    REQUIRE(cudaSuccess == cudaMemcpy(ptr, vec.data(), vec.size() * sizeof(T), cudaMemcpyHostToDevice));
+  }
+
+  ~pointer_t()
+  {
+    if (ptr)
+    {
+      REQUIRE(cudaSuccess == cudaFree(ptr));
+      ptr = nullptr;
+    }
+  }
+
+  T operator[](int i) const
+  {
+    T value{};
+    REQUIRE(cudaSuccess == cudaMemcpy(&value, ptr + i, sizeof(T), cudaMemcpyDeviceToHost));
+    return value;
+  }
+
+  operator cccl_iterator_t()
+  {
+    cccl_iterator_t it;
+    it.size = sizeof(T);
+    it.alignment = alignof(T);
+    it.type = cccl_iterator_kind_t::pointer;
+    it.state = ptr;
+    it.value_type = get_type_info<T>();
+    return it;
+  }
+};
+
+struct operation_t
+{
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type = cccl_op_kind_t::stateless;
+    op.name = name.c_str();
+    op.ltoir = code.c_str();
+    op.ltoir_size = code.size();
+    return op;
+  }
+};
+
+template <class OpT>
+struct stateful_operation_t
+{
+  OpT op_state;
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type = cccl_op_kind_t::stateful;
+    op.size = sizeof(OpT);
+    op.alignment = alignof(OpT);
+    op.state = &op_state;
+    op.name = name.c_str();
+    op.ltoir = code.c_str();
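+    // note: cccl_op_t borrows the name/ltoir storage from this object, so the
+    // operation_t/stateful_operation_t instance must outlive every use of op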
+ op.ltoir_size = code.size(); + return op; + } +}; + +static operation_t make_operation(std::string name, std::string code) +{ + return operation_t{name, compile(code)}; +} + +template +static stateful_operation_t make_operation(std::string name, std::string code, OpT op) +{ + return {op, name, compile(code)}; +} + +template +struct iterator_t +{ + StateT state; + operation_t advance; + operation_t dereference; + + operator cccl_iterator_t() + { + cccl_iterator_t it; + it.size = sizeof(StateT); + it.alignment = alignof(StateT); + it.type = cccl_iterator_kind_t::iterator; + it.advance = advance; + it.dereference = dereference; + it.value_type = get_type_info(); + it.state = &state; + return it; + } +}; + +template +iterator_t make_iterator(std::string state, operation_t advance, operation_t dereference) +{ + iterator_t it; + it.advance = make_operation(advance.name, state + advance.code); + it.dereference = make_operation(dereference.name, state + dereference.code); + return it; +} + +template +struct value_t +{ + T value; + + value_t(T value) + : value(value) + {} + + operator cccl_value_t() + { + cccl_value_t v; + v.type = get_type_info(); + v.state = &value; + return v; + } +}; diff --git a/c/test/test_main.cpp b/c/test/test_main.cpp new file mode 100644 index 0000000000..3e3b4900a5 --- /dev/null +++ b/c/test/test_main.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#include + +#define CATCH_CONFIG_RUNNER +#include + +int device_guard(int device_id) +{ + int device_count{}; + if (cudaGetDeviceCount(&device_count) != cudaSuccess) + { + std::cerr << "Can't query devices number." << std::endl; + std::exit(-1); + } + + if (device_id >= device_count || device_id < 0) + { + std::cerr << "Invalid device ID: " << device_id << std::endl; + std::exit(-1); + } + + return device_id; +} + +int main(int argc, char* argv[]) +{ + Catch::Session session; + + int device_id{}; + + // Build a new parser on top of Catch's + using namespace Catch::clara; + auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use"); + session.cli(cli); + + int returnCode = session.applyCommandLine(argc, argv); + if (returnCode != 0) + { + return returnCode; + } + + cudaSetDevice(device_guard(device_id)); + return session.run(argc, argv); +} diff --git a/c/test/test_reduce.cpp b/c/test/test_reduce.cpp new file mode 100644 index 0000000000..1a4607702a --- /dev/null +++ b/c/test/test_reduce.cpp @@ -0,0 +1,285 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include "c2h.h" + +void reduce(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + const int cc_major = deviceProp.major; + const int cc_minor = deviceProp.minor; + + const char* cub_path = TEST_CUB_PATH; + const char* thrust_path = TEST_THRUST_PATH; + const char* libcudacxx_path = TEST_LIBCUDACXX_PATH; + const char* ctk_path = TEST_CTK_PATH; + + cccl_device_reduce_build_result_t build; + REQUIRE(CUDA_SUCCESS + == cccl_device_reduce_build( + &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path)); + + const std::string sass = inspect_sass(build.cubin, build.cubin_size); + REQUIRE(sass.find("LDL") == std::string::npos); + REQUIRE(sass.find("STL") == std::string::npos); + + size_t temp_storage_bytes = 0; + REQUIRE( + CUDA_SUCCESS == cccl_device_reduce(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + + pointer_t temp_storage(temp_storage_bytes); + + REQUIRE(CUDA_SUCCESS + == cccl_device_reduce(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0)); + REQUIRE(CUDA_SUCCESS == cccl_device_reduce_cleanup(&build)); +} + +using integral_types = std::tuple; +TEMPLATE_LIST_TEST_CASE("Reduce works with integral types", "[reduce]", integral_types) +{ + const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + const std::vector input = generate(num_items); + pointer_t input_ptr(input); + pointer_t output_ptr(1); + value_t init{TestType{42}}; + + reduce(input_ptr, output_ptr, num_items, op, init); + + const TestType output = output_ptr[0]; + const TestType expected = std::accumulate(input.begin(), input.end(), init.value); + REQUIRE(output == expected); +} + +struct pair +{ + short a; + size_t b; +}; + +TEST_CASE("Reduce works with custom types", "[reduce]") +{ + const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24))); + + operation_t op = make_operation( + "op", + "struct pair { short a; size_t b; };\n" + "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n" + " return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n" + "}"); + const std::vector a = generate(num_items); + const std::vector b = generate(num_items); + std::vector input(num_items); + for (std::size_t i = 0; i < num_items; ++i) + { + input[i] = pair{a[i], b[i]}; + } + pointer_t input_ptr(input); + pointer_t output_ptr(1); + value_t init{pair{4, 2}}; + + reduce(input_ptr, output_ptr, num_items, op, init); + + const pair output = output_ptr[0]; + const pair expected = std::accumulate(input.begin(), input.end(), init.value, [](const pair& lhs, const pair& rhs) { + return pair{short(lhs.a + rhs.a), lhs.b + rhs.b}; + }); + REQUIRE(output.a == expected.a); + REQUIRE(output.b == expected.b); +} + +struct counting_iterator_state_t +{ + int value; +}; + +TEST_CASE("Reduce works with input iterators", "[reduce]") +{ + const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t input_it = make_iterator( + "struct counting_iterator_state_t { int value; };\n", + {"advance", + "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n" + " state->value += offset;\n" 
+ "}"}, + {"dereference", + "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 0; + pointer_t output_it(1); + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const int output = output_it[0]; + const int expected = init.value + num_items * (num_items - 1) / 2; + REQUIRE(output == expected); +} + +struct transform_output_iterator_state_t +{ + int* d_output; +}; + +TEST_CASE("Reduce works with output iterators", "[reduce]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"advance", + "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"dereference", + "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + const std::vector input = generate(num_items); + pointer_t input_it(input); + pointer_t inner_output_it(1); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const int output = inner_output_it[0]; + const int expected = std::accumulate(input.begin(), input.end(), init.value); + REQUIRE(output == expected * 2); +} + +template +struct constant_iterator_state_t +{ + T value; +}; + +TEST_CASE("Reduce works with input and output iterators", "[reduce]") +{ + const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16))); + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { int value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + iterator_t output_it = make_iterator( + "struct transform_output_iterator_state_t { int* d_output; };\n", + {"out_advance", + "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n" + " state->d_output += offset;\n" + "}"}, + {"out_dereference", + "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n" + " *state->d_output = 2 * x;\n" + "}"}); + pointer_t inner_output_it(1); + output_it.state.d_output = inner_output_it.ptr; + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const int output = inner_output_it[0]; + const int expected = 2 * (init.value + num_items); + REQUIRE(output == expected); +} + +TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]") +{ + const int num_items = 1 << 14; // 16384 > 128 + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { char value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + pointer_t 
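+  // the init value is a wider type than the char inputs, so the accumulator
+  // (and this output, presumably size_t) does not wrap at CHAR_MAX even
+  // though 16384 ones are summed (cf. "16384 > 128" above)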
output_it(1); + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const size_t output = output_it[0]; + const int expected = init.value + num_items; + REQUIRE(output == expected); +} + +TEST_CASE("Reduce works with large inputs", "[reduce]") +{ + const size_t num_items = 1ull << 33; + operation_t op = make_operation("op", get_op(get_type_info().type)); + iterator_t> input_it = make_iterator>( + "struct constant_iterator_state_t { char value; };\n", + {"in_advance", + "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n" + "}"}, + {"in_dereference", + "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n" + " return state->value;\n" + "}"}); + input_it.state.value = 1; + pointer_t output_it(1); + value_t init{42}; + + reduce(input_it, output_it, num_items, op, init); + + const size_t output = output_it[0]; + const size_t expected = init.value + num_items; + REQUIRE(output == expected); +} + +struct invocation_counter_state_t +{ + int* d_counter; +}; + +TEST_CASE("Reduce works with stateful operators", "[reduce]") +{ + const int num_items = 1 << 12; + pointer_t counter(1); + stateful_operation_t op = make_operation( + "op", + "struct invocation_counter_state_t { int* d_counter; };\n" + "extern \"C\" __device__ int op(invocation_counter_state_t *state, int a, int b) {\n" + " atomicAdd(state->d_counter, 1);\n" + " return a + b;\n" + "}", + invocation_counter_state_t{counter.ptr}); + + const std::vector input = generate(num_items); + pointer_t input_ptr(input); + pointer_t output_ptr(1); + value_t init{42}; + + reduce(input_ptr, output_ptr, num_items, op, init); + + const int invocation_count = counter[0]; + const int expected_invocation_count = num_items - 1; + REQUIRE(invocation_count > expected_invocation_count); + + const int output = output_ptr[0]; + const int expected = std::accumulate(input.begin(), input.end(), init.value); + REQUIRE(output == expected); +} diff --git a/ci/build_common.sh b/ci/build_common.sh index e0bfc05c3b..1f5655d671 100755 --- a/ci/build_common.sh +++ b/ci/build_common.sh @@ -13,6 +13,7 @@ CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc` CUDA_ARCHS= # Empty, use presets by default. GLOBAL_CMAKE_OPTIONS=() DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks. +CONFIGURE_ONLY=false # Check if the correct number of arguments has been provided function usage { @@ -21,7 +22,8 @@ function usage { echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores." echo echo "Options:" - echo " -v/--verbose: enable shell echo for debugging" + echo " -v/-verbose: enable shell echo for debugging" + echo " -configure: Only run cmake to configure, do not build or test." 
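+  # With -configure, configure_preset prints a summary and exits right after
+  # cmake, and build_preset/test_preset below return without doing anything.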
echo " -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" echo " -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" echo " -std: CUDA/C++ standard (Defaults to 17)" @@ -32,6 +34,7 @@ function usage { echo " $ PARALLEL_LEVEL=8 $0" echo " $ PARALLEL_LEVEL=8 $0 -cxx g++-9" echo " $ $0 -cxx clang++-8" + echo " $ $0 -configure -arch=80" echo " $ $0 -cxx g++-8 -std 14 -arch 80-real -v -cuda /usr/local/bin/nvcc" echo " $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\"" exit 1 @@ -44,7 +47,8 @@ function usage { args=("$@") while [ "${#args[@]}" -ne 0 ]; do case "${args[0]}" in - -v | --verbose) VERBOSE=1; args=("${args[@]:1}");; + -v | --verbose | -verbose) VERBOSE=1; args=("${args[@]:1}");; + -configure) CONFIGURE_ONLY=true; args=("${args[@]:1}");; -cxx) HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");; -std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");; @@ -186,6 +190,16 @@ function configure_preset() run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE $CMAKE_OPTIONS "${GLOBAL_CMAKE_OPTIONS[@]}" status=$? popd > /dev/null + + if $CONFIGURE_ONLY; then + echo "${BUILD_NAME} configuration complete:" + echo " Exit code: ${status}" + echo " CMake Preset: ${PRESET}" + echo " CMake Options: ${CMAKE_OPTIONS}" + echo " Build Directory: ${BUILD_DIR}/${PRESET}" + exit $status + fi + return $status } @@ -196,6 +210,10 @@ function build_preset() { local red="1;31" local GROUP_NAME="🏗️ Build ${BUILD_NAME}" + if $CONFIGURE_ONLY; then + return 0 + fi + local preset_dir="${BUILD_DIR}/${PRESET}" local sccache_json="${preset_dir}/sccache_stats.json" @@ -239,6 +257,10 @@ function test_preset() local PRESET=$2 local GPU_REQUIRED=${3:-true} + if $CONFIGURE_ONLY; then + return 0 + fi + if $GPU_REQUIRED; then fail_if_no_gpu fi @@ -265,5 +287,8 @@ function configure_and_build_preset() local CMAKE_OPTIONS=$3 configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS" - build_preset "$BUILD_NAME" "$PRESET" + + if ! 
$CONFIGURE_ONLY; then + build_preset "$BUILD_NAME" "$PRESET" + fi } diff --git a/ci/build_cub.sh b/ci/build_cub.sh index 73236170a6..ce658bf66c 100755 --- a/ci/build_cub.sh +++ b/ci/build_cub.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/build_cudax.sh b/ci/build_cudax.sh index 657372b191..2dff254972 100755 --- a/ci/build_cudax.sh +++ b/ci/build_cudax.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/build_libcudacxx.sh b/ci/build_libcudacxx.sh index 1dc26f3228..1f6925b0fa 100755 --- a/ci/build_libcudacxx.sh +++ b/ci/build_libcudacxx.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/build_thrust.sh b/ci/build_thrust.sh index 6e4a82da0f..6c9281c9c4 100755 --- a/ci/build_thrust.sh +++ b/ci/build_thrust.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -euo pipefail + source "$(dirname "$0")/build_common.sh" print_environment_details diff --git a/ci/inspect_changes.sh b/ci/inspect_changes.sh index 22e9cf492d..72c37ba9c5 100755 --- a/ci/inspect_changes.sh +++ b/ci/inspect_changes.sh @@ -27,6 +27,7 @@ subprojects=( thrust cudax pycuda + c ) # ...and their dependencies: @@ -36,7 +37,8 @@ declare -A dependencies=( [cub]="cccl libcudacxx thrust" [thrust]="cccl libcudacxx cub" [cudax]="cccl libcudacxx" - [pycuda]="cccl libcudacxx cub thrust cudax" + [pycuda]="cccl libcudacxx cub thrust c" + [c]="cccl libcudacxx cub" ) declare -A project_names=( @@ -46,13 +48,14 @@ declare -A project_names=( [thrust]="Thrust" [cudax]="CUDA Experimental" [pycuda]="pycuda" + [c]="CUDA C Core Library " ) # By default, the project directory is assumed to be the same as the subproject name, # but can be overridden here. The `cccl` project is special, and checks for files outside # of any subproject directory. 
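# For example, a change under libcudacxx/ marks cub, thrust, cudax, pycuda,
# and c as dirty, since each of them lists libcudacxx as a dependency above.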
declare -A project_dirs=( - [pycuda]="python/cuda" + [pycuda]="python/cuda_cooperative" ) # Usage checks: diff --git a/ci/matrix.yaml b/ci/matrix.yaml index fa2f5d9218..e3102f8487 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -15,9 +15,9 @@ workflows: - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'} # Current CTK - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16']} + - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']} - - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang17', 'msvc2022']} + - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} # Modded builds: - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'} @@ -30,61 +30,39 @@ workflows: # cudax has different CTK reqs: - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc9', 'gcc10', 'gcc11']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13']} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang14', 'clang15']} + - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0', ], std: 20, cxx: ['msvc14.36']} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 20, cxx: ['msvc2022']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 17, cxx: ['gcc12'], sm: "90"} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc12'], sm: "90a"} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc12', 'clang16'], cpu: 'arm64'} - - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['intel']} + - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 17, cxx: ['gcc13'], sm: "90a"} + - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'} - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang16']} + - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} # Python jobs: - - {jobs: ['test'], project: 'pycuda'} + - {jobs: ['test'], project: 'pycuda', ctk: ['12.5']} # cccl-infra: - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']} - - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + # Edge-case jobs + - {jobs: ['limited'], project: 'cub', std: 17} nightly: - # libcudacxx build fails, CUB tests fail: - - {jobs: ['build'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11], project: ['cub']} - - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11], project: ['thrust']} - # - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11] } + - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]} + - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', 
cxx: 'clang9', std: [17]} + - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} + - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} + - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} + - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]} + # H100 runners are currently flakey, only build since those use CPU-only runners: + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} + - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} - # libcudacxx build fails, CUB tests fail: - - {jobs: ['build'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17], project: ['cub']} - - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17], project: ['thrust']} - # - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17] } - - # CUB + libcudacxx tests fails: - - {jobs: ['build'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17], project: ['libcudacxx', 'cub']} - - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17], project: ['thrust']} - # - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17] } - - # libcudacxx tests fail: - - {jobs: ['build'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['libcudacxx']} - - {jobs: ['build'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['libcudacxx']} - # H100 runners are currently flakey, only build since those use CPU-only runners: - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} - - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17]} - - - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14], project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['cub', 'thrust']} - - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11], project: ['cub', 'thrust']} - # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14] } - # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all' } - # - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11] } - # - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20] } - # - {jobs: ['test'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang16', std: [17] } - - # nvrtc: - - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc12', std: [20], project: ['libcudacxx']} - - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc12', std: [20], project: ['libcudacxx']} - - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc12', std: 'all', project: ['libcudacxx']} - # Fails on h100: - # - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20], project: ['libcudacxx']} + # nvrtc: + - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} + - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} + - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']} + - {jobs: ['nvrtc'], ctk: 
'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']} # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. exclude: @@ -108,7 +86,8 @@ ctk_versions: 11.1: { stds: [11, 14, 17, ] } 11.8: { stds: [11, 14, 17, ] } 12.0: { stds: [11, 14, 17, 20] } - 12.5: { stds: [11, 14, 17, 20], aka: 'curr' } + 12.5: { stds: [11, 14, 17, 20]} + 12.6: { stds: [11, 14, 17, 20], aka: 'curr' } device_compilers: nvcc: # Version / stds are taken from CTK @@ -146,6 +125,7 @@ host_compilers: 15: { stds: [11, 14, 17, 20] } 16: { stds: [11, 14, 17, 20] } 17: { stds: [11, 14, 17, 20] } + 18: { stds: [11, 14, 17, 20] } msvc: name: 'MSVC' container_tag: 'cl' @@ -200,6 +180,9 @@ jobs: test_lid1: { name: 'DeviceLaunch', gpu: true, needs: 'build', invoke: { prefix: 'test', args: '-lid1'} } # - captured in a CUDA graph for deferred launch (lid2): test_lid2: { name: 'GraphCapture', gpu: true, needs: 'build', invoke: { prefix: 'test', args: '-lid2'} } + # Limited build reduces the number of runtime test cases, available device memory, etc, and may be used + # to reduce test runtime in limited environments. + limited: { name: "SmallGMem", gpu: true, needs: 'build', invoke: { prefix: 'test', args: '-limited'} } # Thrust: test_cpu: { name: 'TestCPU', gpu: false, needs: 'build', invoke: { prefix: 'test', args: '-cpu-only'} } diff --git a/ci/pretty_printing.sh b/ci/pretty_printing.sh index 7e02468ee4..1f76ec3981 100644 --- a/ci/pretty_printing.sh +++ b/ci/pretty_printing.sh @@ -97,6 +97,10 @@ function print_time_summary() { fi done + if [ "$max_length" -eq 0 ]; then + return + fi + echo "Time Summary:" for group in "${!command_durations[@]}"; do printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}" diff --git a/ci/test_cub.sh b/ci/test_cub.sh index 9e036bd06f..59c97adfe2 100755 --- a/ci/test_cub.sh +++ b/ci/test_cub.sh @@ -6,10 +6,11 @@ NO_LID=false LID0=false LID1=false LID2=false +LIMITED=false ci_dir=$(dirname "$0") -new_args=$("${ci_dir}/util/extract_switches.sh" -no-lid -lid0 -lid1 -lid2 -- "$@") +new_args=$("${ci_dir}/util/extract_switches.sh" -no-lid -lid0 -lid1 -lid2 -limited -- "$@") eval set -- ${new_args} while true; do case "$1" in @@ -29,6 +30,10 @@ while true; do LID2=true shift ;; + -limited) + LIMITED=true + shift + ;; --) shift break @@ -40,6 +45,21 @@ while true; do esac done +if $LIMITED; then + + export CCCL_SEED_COUNT_OVERRIDE=1 + readonly device_mem_GiB=8 + export CCCL_DEVICE_MEMORY_LIMIT=$((${device_mem_GiB} * 1024 * 1024 * 1024)) + export CCCL_DEBUG_CHECKED_ALLOC_FAILURES=1 + + + echo "Configuring limited environment:" + echo " CCCL_SEED_COUNT_OVERRIDE=${CCCL_SEED_COUNT_OVERRIDE}" + echo " CCCL_DEVICE_MEMORY_LIMIT=${CCCL_DEVICE_MEMORY_LIMIT} (${device_mem_GiB} GiB)" + echo " CCCL_DEBUG_CHECKED_ALLOC_FAILURES=${CCCL_DEBUG_CHECKED_ALLOC_FAILURES}" + echo +fi + source "${ci_dir}/build_common.sh" print_environment_details diff --git a/ci/test_pycuda.sh b/ci/test_pycuda.sh index 742b22ec2a..bd66cc5771 100755 --- a/ci/test_pycuda.sh +++ b/ci/test_pycuda.sh @@ -11,10 +11,21 @@ fail_if_no_gpu readonly prefix="${BUILD_DIR}/python/" export PYTHONPATH="${prefix}:${PYTHONPATH:-}" -pushd ../python/cuda >/dev/null +pushd ../python/cuda_cooperative >/dev/null -run_command "⚙️ Pip install cuda" pip install --force-reinstall --target "${prefix}" .[test] -run_command "🚀 Pytest cuda" python -m pytest -v ./tests +run_command "⚙️ Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target 
"${prefix}" .[test] +run_command "🚀 Pytest cuda_cooperative" python -m pytest -v ./tests + +popd >/dev/null + +pushd ../python/cuda_parallel >/dev/null + +# Temporarily install the package twice to populate include directory as part of the first installation +# and to let manifest discover these includes during the second installation. Do not forget to remove the +# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed. +run_command "⚙️ Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test] +run_command "⚙️ Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test] +run_command "🚀 Pytest cuda_parallel" python -m pytest -v ./tests popd >/dev/null diff --git a/ci/update_version.sh b/ci/update_version.sh index febf963e17..9184b98e6a 100755 --- a/ci/update_version.sh +++ b/ci/update_version.sh @@ -36,7 +36,8 @@ CUB_CMAKE_VERSION_FILE="cub/cub/cmake/cub-config-version.cmake" LIBCUDACXX_CMAKE_VERSION_FILE="libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake" THRUST_CMAKE_VERSION_FILE="thrust/thrust/cmake/thrust-config-version.cmake" CUDAX_CMAKE_VERSION_FILE="cudax/lib/cmake/cudax/cudax-config-version.cmake" -PYCUDA_VERSION_FILE="python/cuda/cuda/cooperative/_version.py" +CUDA_COOPERATIVE_VERSION_FILE="python/cuda_cooperative/cuda/cooperative/_version.py" +CUDA_PARALLEL_VERSION_FILE="python/cuda_parallel/cuda/parallel/_version.py" # Calculated version codes new_cccl_version=$((major * 1000000 + minor * 1000 + patch)) # MMMmmmppp @@ -102,7 +103,8 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" " update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)" -update_file "$PYCUDA_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" +update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" +update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" if [ "$DRY_RUN" = true ]; then echo "Dry run completed. No changes made." 
diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt index 1c3102d0d7..3932fffea6 100644 --- a/cub/benchmarks/CMakeLists.txt +++ b/cub/benchmarks/CMakeLists.txt @@ -106,7 +106,7 @@ function(add_bench_dir bench_dir) add_bench(bench_target ${tuning_name} "${bench_src}") # for convenience, make tuning variant buildable by default file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n") - target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}") + target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}") else() # benchmarking register_cccl_benchmark("${bench_name}" "") diff --git a/cub/benchmarks/bench/radix_sort/keys.cu b/cub/benchmarks/bench/radix_sort/keys.cu index f3b7ba3867..b6b9e4fd53 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -26,6 +26,7 @@ ******************************************************************************/ #include +#include #include @@ -123,7 +124,7 @@ constexpr std::size_t max_temp_storage_size() template constexpr bool fits_in_default_shared_memory() { - return max_temp_storage_size() < 48 * 1024; + return max_temp_storage_size() < cub::detail::max_smem_per_block; } #else // TUNE_BASE template diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 2729ce1b62..4a9f229bca 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -26,6 +26,7 @@ ******************************************************************************/ #include +#include #include @@ -121,7 +122,7 @@ constexpr std::size_t max_temp_storage_size() template constexpr bool fits_in_default_shared_memory() { - return max_temp_storage_size() < 48 * 1024; + return max_temp_storage_size() < cub::detail::max_smem_per_block; } #else // TUNE_BASE template diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu new file mode 100644 index 0000000000..a6c149ffdd --- /dev/null +++ b/cub/benchmarks/bench/reduce/min.cu @@ -0,0 +1,37 @@ +/****************************************************************************** + * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +// NOTE: this benchmark is intented to cover DPX instructions on Hopper+ architectures. +// It specifically uses cub::Min instead of a user-defined operator. +#define TUNE_T int16_t +#include + +// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 +// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 +// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 + +using op_t = cub::Min; +#include "base.cuh" diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 65b760fba2..e3cd7a7be8 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -27,6 +27,8 @@ #include +#include + #include #if !TUNE_BASE @@ -85,7 +87,7 @@ template static void basic(nvbench::state& state, nvbench::type_list) { using init_t = cub::detail::InputValue; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using input_it_t = const T*; using output_it_t = T*; using offset_t = OffsetT; @@ -129,7 +131,7 @@ static void basic(nvbench::state& state, nvbench::type_list) }); } -using some_offset_types = nvbench::type_list; +using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 3830ad7764..26676d66c2 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -77,7 +77,7 @@ static void scan(nvbench::state& state, nvbench::type_list; + using accum_t = ::cuda::std::__accumulator_t; using key_input_it_t = const KeyT*; using val_input_it_t = const ValueT*; using val_output_it_t = ValueT*; diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu new file mode 100644 index 0000000000..02d2bc2ced --- /dev/null +++ b/cub/benchmarks/bench/select/unique.cu @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include +#include + +// %RANGE% TUNE_TRANSPOSE trp 0:1:1 +// %RANGE% TUNE_LOAD ld 0:1:1 +// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 +// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 +// %RANGE% TUNE_MAGIC_NS ns 0:2048:4 +// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 +// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 + +constexpr bool keep_rejects = false; + +#if !TUNE_BASE +# if TUNE_TRANSPOSE == 0 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT +# else // TUNE_TRANSPOSE == 1 +# define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE +# endif // TUNE_TRANSPOSE + +# if TUNE_LOAD == 0 +# define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT +# else // TUNE_LOAD == 1 +# define TUNE_LOAD_MODIFIER cub::LOAD_CA +# endif // TUNE_LOAD + +template +struct policy_hub_t +{ + struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + { + static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; + + static constexpr int ITEMS_PER_THREAD = + CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); + + using SelectIfPolicyT = + cub::AgentSelectIfPolicy; + }; + + using MaxPolicy = policy_t; +}; +#endif // !TUNE_BASE + +template +static void unique(nvbench::state& state, nvbench::type_list) +{ + using input_it_t = const T*; + using flag_it_t = cub::NullType*; + using output_it_t = T*; + using num_selected_it_t = OffsetT*; + using select_op_t = cub::NullType; + using equality_op_t = cub::Equality; + using offset_t = OffsetT; + constexpr bool may_alias = InPlaceAlgT::value; + +#if !TUNE_BASE + using policy_t = policy_hub_t; + using dispatch_t = cub::DispatchSelectIf< + input_it_t, + flag_it_t, + output_it_t, + num_selected_it_t, + select_op_t, + equality_op_t, + offset_t, + keep_rejects, + may_alias, + policy_t>; +#else // TUNE_BASE + using dispatch_t = cub::DispatchSelectIf< + input_it_t, + flag_it_t, + output_it_t, + num_selected_it_t, + select_op_t, + equality_op_t, + offset_t, + keep_rejects, + may_alias>; +#endif // TUNE_BASE + + // Retrieve axis parameters + const auto elements = static_cast(state.get_int64("Elements{io}")); + constexpr std::size_t min_segment_size = 1; + const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); + + thrust::device_vector in = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); + thrust::device_vector out(elements); + thrust::device_vector num_unique_out(1); + + input_it_t d_in = thrust::raw_pointer_cast(in.data()); + output_it_t d_out = thrust::raw_pointer_cast(out.data()); + flag_it_t d_flags = nullptr; + num_selected_it_t d_num_unique = thrust::raw_pointer_cast(num_unique_out.data()); + + // Get temporary storage requirements + std::size_t temp_size{}; + dispatch_t::Dispatch( + nullptr, temp_size, d_in, d_flags, d_out, d_num_unique, select_op_t{}, equality_op_t{}, elements, 0); + + thrust::device_vector temp(temp_size); + auto* temp_storage = thrust::raw_pointer_cast(temp.data()); + + // Get number of unique elements + dispatch_t::Dispatch( + temp_storage, temp_size, d_in, d_flags, d_out, d_num_unique, select_op_t{}, equality_op_t{}, elements, 0); + + cudaDeviceSynchronize(); + const OffsetT num_unique = num_unique_out[0]; + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(num_unique); + state.add_global_memory_writes(1); + + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { + dispatch_t::Dispatch( + 
temp_storage, + temp_size, + d_in, + d_flags, + d_out, + d_num_unique, + select_op_t{}, + equality_op_t{}, + elements, + launch.get_stream()); + }); +} + +using in_place_alg = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>; + +NVBENCH_BENCH_TYPES(unique, NVBENCH_TYPE_AXES(fundamental_types, offset_types, in_place_alg)) + .set_name("base") + .set_type_axes_names({"T{ct}", "OffsetT{ct}", "IsInPlace{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); diff --git a/cub/benchmarks/bench/transform/babelstream.h b/cub/benchmarks/bench/transform/babelstream.h new file mode 100644 index 0000000000..0f482d59e2 --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream.h @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +#include + +#include + +#include + +template +#if TUNE_BASE +using policy_hub_t = cub::detail::transform::policy_hub>; +#else +struct policy_hub_t +{ + struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy> + { + static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); + static constexpr auto algorithm = static_cast(TUNE_ALGORITHM); + using algo_policy = + ::cuda::std::_If>; + }; +}; +#endif + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = + nvbench::type_list; +#endif + +// BabelStream uses 2^25, H200 can fit 2^31 int128s +// 2^20 chars / 2^16 int128 saturate V100 (min_bif =12 * SM count =80) +// 2^21 chars / 2^17 int128 saturate A100 (min_bif =16 * SM count =108) +// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif =32or48 * SM count =132) +// inline auto array_size_powers = std::vector{28}; +inline auto array_size_powers = nvbench::range(16, 28, 4); + +template +void bench_transform( + nvbench::state& state, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + OffsetT num_items, + TransformOp transform_op, + ExecTag exec_tag = nvbench::exec_tag::no_batch) +{ + state.exec(exec_tag, [&](const nvbench::launch& launch) { + cub::detail::transform::dispatch_t< + false, + OffsetT, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp, + policy_hub_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream()); + }); +} + +// Modified from BabelStream to also work for integers +inline constexpr auto startA = 1; // BabelStream: 0.1 +inline constexpr auto startB = 2; // BabelStream: 0.2 +inline constexpr auto startC = 3; // BabelStream: 0.1 +inline constexpr auto startScalar = 4; // BabelStream: 0.4 + +// TODO(bgruber): we should put those somewhere into libcu++: +// from C++ GSL +struct narrowing_error : std::runtime_error +{ + narrowing_error() + : std::runtime_error("Narrowing error") + {} +}; + +// from C++ GSL +// implementation insipired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow +template ::value, int> = 0> +constexpr DstT narrow(SrcT value) +{ + constexpr bool is_different_signedness = ::cuda::std::is_signed::value != ::cuda::std::is_signed::value; + const auto converted = static_cast(value); + if (static_cast(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{})))) + { + throw narrowing_error{}; + } + return converted; +} diff --git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu new file mode 100644 
index 0000000000..87abdfef6f
--- /dev/null
+++ b/cub/benchmarks/bench/transform/babelstream1.cu
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// keep checks at the top so compilation of discarded variants fails really fast
+#if !TUNE_BASE
+#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#    error "Cannot compile algorithm 4 (ublkcp) below sm90"
+#  endif
+
+#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
+#  endif
+#endif
+
+#include "babelstream.h"
+
+#if !TUNE_BASE
+#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
+#    error "This benchmark does not support being compiled for multiple architectures"
+#  endif
+#endif
+
+template <typename T, typename OffsetT>
+static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(n);
+  state.add_global_memory_writes<T>(n);
+
+  const T scalar = startScalar;
+  bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) {
+    return ci * scalar;
+  });
+}
+
+NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("mul")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu
new file mode 100644
index 0000000000..c8fa017b78
--- /dev/null
+++ b/cub/benchmarks/bench/transform/babelstream2.cu
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// keep checks at the top so compilation of discarded variants fails really fast
+#if !TUNE_BASE
+#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#    error "Cannot compile algorithm 4 (ublkcp) below sm90"
+#  endif
+
+#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
+#  endif
+#endif
+
+#include "babelstream.h"
+
+#if !TUNE_BASE
+#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
+#    error "This benchmark does not support being compiled for multiple architectures"
+#  endif
+#endif
+
+template <typename T, typename OffsetT>
+static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  bench_transform(
+    state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
+      return ai + bi;
+    });
+}
+
+NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("add")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
+
+template <typename T, typename OffsetT>
+static void triad(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(2 * n);
+  state.add_global_memory_writes<T>(n);
+  const T scalar = startScalar;
+  bench_transform(
+    state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) {
+      return bi + scalar * ci;
+    });
+}
+
+NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("triad")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu
new file mode 100644
index 0000000000..db54155421
--- /dev/null
+++ b/cub/benchmarks/bench/transform/babelstream3.cu
@@ -0,0 +1,64 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+// %RANGE% TUNE_THREADS tpb 128:1024:128
+// %RANGE% TUNE_ALGORITHM alg 0:1:1
+
+// keep checks at the top so compilation of discarded variants fails really fast
+#if !TUNE_BASE
+#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
+#    error "Cannot compile algorithm 4 (ublkcp) below sm90"
+#  endif
+
+#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
+#    error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
+#  endif
+#endif
+
+#include "babelstream.h"
+
+#if !TUNE_BASE
+#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
+#    error "This benchmark does not support being compiled for multiple architectures"
+#  endif
+#endif
+
+template <typename T, typename OffsetT>
+static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  const auto n         = narrow<OffsetT>(state.get_int64("Elements{io}"));
+  const auto overwrite = static_cast<bool>(state.get_int64("OverwriteInput"));
+  thrust::device_vector<T> a(n, startA);
+  thrust::device_vector<T> b(n, startB);
+  thrust::device_vector<T> c(n, startC);
+
+  // The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes
+  // the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to
+  // choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input.
+  thrust::device_vector<T> d;
+  if (!overwrite)
+  {
+    d.resize(n);
+  }
+
+  state.add_element_count(n);
+  state.add_global_memory_reads<T>(3 * n);
+  state.add_global_memory_writes<T>(n);
+  const T scalar = startScalar;
+  bench_transform(
+    state,
+    ::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
+    overwrite ? a.begin() : d.begin(),
+    n,
+    [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
+      return ai + bi + scalar * ci;
+    },
+    nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when
+                              // OverwriteInput=0, but doesn't hurt.
+}
+
+NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types))
+  .set_name("nstream")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", array_size_powers)
+  .add_int64_axis("OverwriteInput", {1});
diff --git a/cub/cmake/header_test.in b/cub/cmake/header_test.in
index 547c2030ab..300fa6abb9 100644
--- a/cub/cmake/header_test.in
+++ b/cub/cmake/header_test.in
@@ -5,7 +5,7 @@
 // Define CUB_MACRO_CHECK(macro, header), which emits a diagnostic indicating
 // a potential macro collision and halts.
 //
-// Use raw platform checks instead of the CUB_HOST_COMPILER macros since we
+// Use raw platform macros instead of the CCCL macros since we
 // don't want to #include any headers other than the one being tested.
 //
 // This is only implemented for MSVC/GCC/Clang.
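As context for reviewers: all of the babelstream benchmarks above funnel their element counts through the GSL-style `narrow` helper defined in `babelstream.h`, which casts and then round-trips the value (plus a signedness check) so that a lossy conversion throws instead of silently truncating. The following is a minimal standalone sketch of that pattern in plain C++; the names here are local to the example and not part of CUB's API:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <type_traits>

struct narrowing_error : std::runtime_error
{
  narrowing_error() : std::runtime_error("Narrowing error") {}
};

// Same round-trip check as the GSL-inspired helper in babelstream.h above.
template <typename DstT, typename SrcT>
constexpr DstT narrow(SrcT value)
{
  const auto converted = static_cast<DstT>(value);
  const bool different_signedness = std::is_signed<DstT>::value != std::is_signed<SrcT>::value;
  // Reject conversions that do not round-trip or that flip the sign.
  if (static_cast<SrcT>(converted) != value
      || (different_signedness && ((converted < DstT{}) != (value < SrcT{}))))
  {
    throw narrowing_error{};
  }
  return converted;
}

int main()
{
  const std::int64_t elements = std::int64_t{1} << 20;
  std::cout << narrow<std::int32_t>(elements) << '\n'; // fine: fits into 32 bits
  try
  {
    narrow<std::int32_t>(std::int64_t{1} << 40); // throws: value does not round-trip
  }
  catch (const narrowing_error&)
  {
    std::cout << "narrowing detected\n";
  }
}
```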
diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index ce204273da..f324de52bc 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -287,22 +287,22 @@ struct AgentHistogram SampleT* d_native_samples; /// The number of output bins for each channel - int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; + int* num_output_bins; /// The number of privatized bins for each channel - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; + int* num_privatized_bins; - /// Reference to gmem privatized histograms for each channel + /// Copy of gmem privatized histograms for each channel CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; /// Reference to final output histograms (gmem) - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; + CounterT** d_output_histograms; /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT* output_decode_op; /// The transform operator for determining privatized counter indices from samples, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT* privatized_decode_op; /// Whether to prefer privatized smem counters vs privatized global counters bool prefer_smem; @@ -810,12 +810,12 @@ struct AgentHistogram _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram( TempStorage& temp_storage, SampleIteratorT d_samples, - int (&num_output_bins)[NUM_ACTIVE_CHANNELS], - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], - CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) + int* num_output_bins, + int* num_privatized_bins, + CounterT** d_output_histograms, + CounterT** d_privatized_histograms, + OutputDecodeOpT* output_decode_op, + PrivatizedDecodeOpT* privatized_decode_op) : temp_storage(temp_storage.Alias()) , d_wrapped_samples(d_samples) , d_native_samples(NativePointer(d_wrapped_samples)) diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 709e9c1bd0..5bc3bae321 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockAdjacentDifference class provides collective methods for computing -//! the differences of adjacent elements partitioned across a CUDA thread block. +//! @file +//! The cub::BlockAdjacentDifference class provides collective methods for computing the differences of adjacent +//! elements partitioned across a CUDA thread block. #pragma once diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 76c073f1b5..284ac4401e 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -26,7 +26,8 @@ * ******************************************************************************/ -//! @file block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. +//! @file +//! block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. 
 #pragma once
diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh
index c91731ae03..21a4879192 100644
--- a/cub/cub/block/block_radix_rank.cuh
+++ b/cub/cub/block/block_radix_rank.cuh
@@ -26,7 +26,8 @@
  *
  ******************************************************************************/

-//! @file cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
+//! @file
+//! cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block

 #pragma once
diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh
index d35c90c06d..12c97ee5b8 100644
--- a/cub/cub/block/block_reduce.cuh
+++ b/cub/cub/block/block_reduce.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file The cub::BlockReduce class provides :ref:`collective <collective-primitives>` methods for computing
-//! a parallel reduction of items partitioned across a CUDA thread block.
+//! @file
+//! The cub::BlockReduce class provides :ref:`collective <collective-primitives>` methods for computing a parallel
+//! reduction of items partitioned across a CUDA thread block.

 #pragma once
diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh
index df7ab6e814..afc4df76d7 100644
--- a/cub/cub/block/block_scan.cuh
+++ b/cub/cub/block/block_scan.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a
-//! parallel prefix sum/scan of items partitioned across a CUDA thread block.
+//! @file
+//! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
+//! sum/scan of items partitioned across a CUDA thread block.

 #pragma once
diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh
index 048c6e3a8e..a3dedcc3c7 100644
--- a/cub/cub/block/block_shuffle.cuh
+++ b/cub/cub/block/block_shuffle.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file The cub::BlockShuffle class provides :ref:`collective <collective-primitives>` methods for shuffling
-//! data partitioned across a CUDA thread block.
+//! @file
+//! The cub::BlockShuffle class provides :ref:`collective <collective-primitives>` methods for shuffling data
+//! partitioned across a CUDA thread block.

 #pragma once
diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh
index 6c9f4f57a8..9d057d7fe4 100644
--- a/cub/cub/block/block_store.cuh
+++ b/cub/cub/block/block_store.cuh
@@ -26,7 +26,8 @@
  *
  ******************************************************************************/

-//! @file Operations for writing linear segments of data from the CUDA thread block
+//! @file
+//! Operations for writing linear segments of data from the CUDA thread block

 #pragma once
diff --git a/cub/cub/cmake/cub-config-version.cmake b/cub/cub/cmake/cub-config-version.cmake
index 2a12c4fa2b..86cdca2275 100644
--- a/cub/cub/cmake/cub-config-version.cmake
+++ b/cub/cub/cmake/cub-config-version.cmake
@@ -2,7 +2,7 @@
 include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake")

 set(CUB_VERSION_MAJOR 2)
-set(CUB_VERSION_MINOR 6)
+set(CUB_VERSION_MINOR 7)
 set(CUB_VERSION_PATCH 0)
 set(CUB_VERSION_TWEAK 0)
 set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}")
diff --git a/cub/cub/config.cuh b/cub/cub/config.cuh
index f7f25ddef0..123f2df46b 100644
--- a/cub/cub/config.cuh
+++ b/cub/cub/config.cuh
@@ -33,7 +33,7 @@
 #pragma once

 // For _CCCL_IMPLICIT_SYSTEM_HEADER
-#include <cuda/__cccl_config>
+#include <cuda/__cccl_config> // IWYU pragma: export

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -43,9 +43,9 @@
 #  pragma system_header
 #endif // no system header

-#include <cub/util_arch.cuh>
-#include <cub/util_compiler.cuh>
-#include <cub/util_cpp_dialect.cuh>
-#include <cub/util_deprecated.cuh>
-#include <cub/util_macro.cuh>
-#include <cub/util_namespace.cuh>
+#include <cub/util_arch.cuh> // IWYU pragma: export
+#include <cub/util_compiler.cuh> // IWYU pragma: export
+#include <cub/util_cpp_dialect.cuh> // IWYU pragma: export
+#include <cub/util_deprecated.cuh> // IWYU pragma: export
+#include <cub/util_macro.cuh> // IWYU pragma: export
+#include <cub/util_namespace.cuh> // IWYU pragma: export
diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh
index f02ae6c002..2c4d6dd5f4 100644
--- a/cub/cub/cub.cuh
+++ b/cub/cub/cub.cuh
@@ -76,6 +76,7 @@
 #include <cub/device/device_segmented_sort.cuh>
 #include <cub/device/device_select.cuh>
 #include <cub/device/device_spmv.cuh>
+#include <cub/device/device_transform.cuh>

 // Grid
 // #include <cub/grid/grid_barrier.cuh>
diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index 10d40cacd1..12dce69c13 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -50,6 +50,8 @@ _CCCL_SUPPRESS_DEPRECATED_PUSH
 _CCCL_SUPPRESS_DEPRECATED_POP
 #include <cuda/std/type_traits>

+#define _CUB_TEMPLATE_REQUIRES(...) ::cuda::std::__enable_if_t<(__VA_ARGS__)>* = nullptr
+
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
@@ -62,9 +64,101 @@ using invoke_result_t = ::cuda::std::invoke_result_t<F, Args...>;
 #endif

-/// The type of intermediate accumulator (according to P2322R6)
-template <typename Invokable, typename InitT, typename InputT>
-using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;
+template <typename T, typename... Ts>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool are_same()
+{
+  return ::cuda::std::conjunction<::cuda::std::is_same<T, Ts>...>::value;
+}
+
+template <typename T, typename... Ts>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_one_of()
+{
+  return ::cuda::std::disjunction<::cuda::std::is_same<T, Ts>...>::value;
+}
+
+template <typename... Ts>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool always_false()
+{
+  return false;
+}
+
+template <typename T, typename V, typename = void>
+struct has_binary_call_operator : ::cuda::std::false_type
+{};
+
+template <typename T, typename V>
+struct has_binary_call_operator<
+  T,
+  V,
+  ::cuda::std::void_t<decltype(::cuda::std::declval<T>()(::cuda::std::declval<V>(), ::cuda::std::declval<V>()))>>
+    : ::cuda::std::true_type
+{};
+
+/***********************************************************************************************************************
+ * Array-like type traits
+ **********************************************************************************************************************/
+
+template <typename T, typename = void>
+struct has_subscript : ::cuda::std::false_type
+{};
+
+template <typename T>
+struct has_subscript<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>()[0])>> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_subscript_t = typename has_subscript<T>::type;
+
+template <typename T, typename = void>
+struct has_size : ::cuda::std::false_type
+{};
+
+// TODO: use ::cuda::std::size(::cuda::std::declval<T>()) once std::size becomes available in libcu++
+template <typename T>
+struct has_size<T, ::cuda::std::void_t<decltype(::cuda::std::declval<T>().size())>> : ::cuda::std::true_type
+{};
+
+template <typename T, ::cuda::std::size_t N>
+struct has_size<T[N]> : ::cuda::std::true_type
+{};
+
+template <typename T>
+using has_size_t = typename has_size<T>::type;
+
+/***********************************************************************************************************************
+ * StaticSize: a type trait that returns the number of elements in an Array-like type
+ **********************************************************************************************************************/
+// StaticSize is useful where size(obj) cannot be checked at compile time
+// e.g.
+//   using Array = NonTriviallyConstructible[8];
+//   std::size(Array{})   // compile error
+//   static_size<Array>() // ok
+
+template <typename T, typename = void>
+struct StaticSize
+{
+  static_assert(detail::always_false<T>(), "StaticSize not supported for this type");
+};
+
+template <typename T>
+struct StaticSize<T, ::cuda::std::void_t<decltype(::cuda::std::integral_constant<::cuda::std::size_t, T{}.size()>{})>>
+{
+  static_assert(::cuda::std::is_trivially_constructible<T>::value, "T must be trivially constructible");
+  static constexpr auto value = T{}.size();
+};
+
+template <typename T, ::cuda::std::size_t N>
+struct StaticSize<T[N]>
+{
+  static constexpr auto value = N;
+};
+
+template <typename T>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::size_t static_size()
+{
+  return StaticSize<T>::value;
+}
 } // namespace detail
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh
index 0d222475b2..a6d24a5221 100644
--- a/cub/cub/device/device_copy.cuh
+++ b/cub/cub/device/device_copy.cuh
@@ -25,7 +25,8 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceCopy provides device-wide, parallel operations for copying data.
+//! @file
+//! cub::DeviceCopy provides device-wide, parallel operations for copying data.

 #pragma once
diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh
index 46f4bee557..e6abc4bd07 100644
--- a/cub/cub/device/device_histogram.cuh
+++ b/cub/cub/device/device_histogram.cuh
@@ -26,9 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceHistogram provides device-wide parallel operations for
-//! constructing histogram(s) from a sequence of samples data residing
-//! within device-accessible memory.
+//! @file
+//! cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of
+//! sample data residing within device-accessible memory.

 #pragma once
diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh
index 1359863a76..e71431cb74 100644
--- a/cub/cub/device/device_memcpy.cuh
+++ b/cub/cub/device/device_memcpy.cuh
@@ -25,7 +25,8 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceMemcpy provides device-wide, parallel operations for copying data.
+//! @file
+//! cub::DeviceMemcpy provides device-wide, parallel operations for copying data.

 #pragma once
diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh
index 08a2ae531f..28bfc377bd 100644
--- a/cub/cub/device/device_partition.cuh
+++ b/cub/cub/device/device_partition.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DevicePartition provides device-wide, parallel operations for
-//! partitioning sequences of data items residing within device-accessible memory.
+//! @file
+//! cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing
+//! within device-accessible memory.
 #pragma once
diff --git a/cub/cub/device/device_radix_sort.cuh b/cub/cub/device/device_radix_sort.cuh
index c653badc47..a14c5e4364 100644
--- a/cub/cub/device/device_radix_sort.cuh
+++ b/cub/cub/device/device_radix_sort.cuh
@@ -26,9 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceRadixSort provides device-wide, parallel operations for
-//! computing a radix sort across a sequence of data items residing within
-//! device-accessible memory.
+//! @file
+//! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data
+//! items residing within device-accessible memory.

 #pragma once
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index a31e641920..4b02129123 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -26,9 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceReduce provides device-wide, parallel operations for
-//! computing a reduction across a sequence of data items residing within
-//! device-accessible memory.
+//! @file
+//! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data
+//! items residing within device-accessible memory.

 #pragma once
diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh
index f3b1a3e669..120562a461 100644
--- a/cub/cub/device/device_run_length_encode.cuh
+++ b/cub/cub/device/device_run_length_encode.cuh
@@ -26,14 +26,16 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceRunLengthEncode provides device-wide, parallel operations
-//! for computing a run-length encoding across a sequence of data items
-//! residing within device-accessible memory.
+//! @file
+//! cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a
+//! sequence of data items residing within device-accessible memory.

 #pragma once

 #include <cub/config.cuh>

+#include <cuda/std/functional>
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -200,7 +202,7 @@ struct DeviceRunLengthEncode
     // Generator type for providing 1s values for run-length reduction
     using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;

-    using accum_t = detail::accumulator_t<reduction_op_t, length_t, length_t>;
+    using accum_t = ::cuda::std::__accumulator_t<reduction_op_t, length_t, length_t>;

     using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;
diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh
index 29f3cf6c1e..27882e9cee 100644
--- a/cub/cub/device/device_scan.cuh
+++ b/cub/cub/device/device_scan.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across
-//! a sequence of data items residing within device-accessible memory.
+//! @file
+//! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
+//! items residing within device-accessible memory.

 #pragma once

@@ -41,12 +42,15 @@
 #  pragma system_header
 #endif // no system header

+#include <cub/detail/choose_offset.cuh>
 #include <cub/detail/nvtx.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_deprecated.cuh>

+#include <cuda/std/functional>
+
 CUB_NAMESPACE_BEGIN

 //! @rst
@@ -152,6 +156,9 @@ struct DeviceScan
   //! @tparam OutputIteratorT
   //!   **[inferred]** Random-access output iterator type for writing scan outputs @iterator
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -172,19 +179,19 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     using InitT = cub::detail::value_t<InputIteratorT>;

     // Initial value
@@ -195,13 +202,13 @@ struct DeviceScan
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -261,6 +268,9 @@ struct DeviceScan
   //! @tparam IteratorT
   //!   **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -278,20 +288,20 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
-    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0)
+    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
   {
     return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -377,6 +387,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -403,7 +416,7 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
       d_temp_storage,
@@ -431,7 +444,7 @@ struct DeviceScan
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -520,6 +533,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -543,28 +559,28 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT, typename ScanOpT, typename InitValueT>
+  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT, typename ScanOpT, typename InitValueT>
+  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -655,6 +671,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
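To make the effect of the new `NumItemsT` parameter concrete: the old entry points hard-coded `int num_items`, whereas now `detail::choose_offset_t<NumItemsT>` picks an offset type wide enough for the inferred count, so a 64-bit size can be passed straight through. A hedged usage sketch (buffer names are illustrative; the two-phase temp-storage idiom is the standard CUB pattern, and the element count shown would of course need a GPU with enough memory):

```cpp
#include <cub/device/device_scan.cuh>

#include <thrust/device_vector.h>

#include <cstdint>

int main()
{
  // With NumItemsT inferred, a 64-bit element count selects a 64-bit offset type internally.
  const std::int64_t num_items = std::int64_t{1} << 31; // more than INT_MAX elements
  thrust::device_vector<int> in(num_items, 1);
  thrust::device_vector<int> out(num_items);

  // Phase 1: query the required temporary storage size.
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in.begin(), out.begin(), num_items);

  // Phase 2: allocate and run the scan.
  thrust::device_vector<std::uint8_t> temp(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp.data());
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in.begin(), out.begin(), num_items);
  return 0;
}
```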
@@ -685,7 +704,8 @@ struct DeviceScan
             typename OutputIteratorT,
             typename ScanOpT,
             typename InitValueT,
-            typename InitValueIterT = InitValueT*>
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT, InitValueIterT>, OffsetT>::Dispatch(
       d_temp_storage,
@@ -717,7 +737,8 @@ struct DeviceScan
             typename OutputIteratorT,
             typename ScanOpT,
             typename InitValueT,
-            typename InitValueIterT = InitValueT*>
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -809,6 +830,9 @@ struct DeviceScan
   //!   **[inferred]** Type of the `init_value` used Binary scan functor type
   //!   having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -832,28 +856,36 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename InitValueIterT = InitValueT*>
+  template <typename IteratorT,
+            typename ScanOpT,
+            typename InitValueT,
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT, typename ScanOpT, typename InitValueT, typename InitValueIterT = InitValueT*>
+  template <typename IteratorT,
+            typename ScanOpT,
+            typename InitValueT,
+            typename InitValueIterT = InitValueT*,
+            typename NumItemsT      = int>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
     FutureValue<InitValueT, InitValueIterT> init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -923,6 +955,9 @@ struct DeviceScan
   //! @tparam OutputIteratorT
   //!   **[inferred]** Random-access output iterator type for writing scan outputs @iterator
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -943,32 +978,32 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //!
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -1027,6 +1062,9 @@ struct DeviceScan
   //! @tparam IteratorT
   //!   **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -1044,20 +1082,20 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
-    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0)
+    void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
   {
     return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT>
+  template <typename IteratorT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -1137,6 +1175,9 @@ struct DeviceScan
   //! @tparam ScanOp
   //!   **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in]
   //!   d_temp_storage Device-accessible allocation of temporary storage.
   //!   When `nullptr`, the required allocation size is written to
@@ -1161,20 +1202,20 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");

-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;

     return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
@@ -1220,6 +1261,9 @@ struct DeviceScan
   //! @tparam InitValueT
   //!   **[inferred]** Type of the `init_value`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in] d_temp_storage
   //!   Device-accessible allocation of temporary storage.
   //!   When `nullptr`, the required allocation size is written to
@@ -1246,7 +1290,7 @@ struct DeviceScan
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within.
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
     InitValueT init_value,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");

-    // Signed integer type for global offsets
-    using OffsetT = int;
-    using AccumT  = cub::detail::accumulator_t<ScanOpT, InitValueT, cub::detail::value_t<InputIteratorT>>;
+    // Unsigned integer type for global offsets
+    using OffsetT = detail::choose_offset_t<NumItemsT>;
+    using AccumT  = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::value_t<InputIteratorT>, InitValueT>;
     constexpr bool ForceInclusive = true;

     return DispatchScan<
@@ -1283,14 +1327,14 @@ struct DeviceScan
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT>
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
@@ -1364,6 +1408,9 @@ struct DeviceScan
   //! @tparam ScanOp
   //!   **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
   //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** An integral type representing the number of input elements
+  //!
   //! @param[in]
   //!   d_temp_storage Device-accessible allocation of temporary storage.
   //!   When `nullptr`, the required allocation size is written to
@@ -1385,26 +1432,26 @@ struct DeviceScan
   //! @rst
   //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
   //! @endrst
-  template <typename IteratorT, typename ScanOpT>
+  template <typename IteratorT, typename ScanOpT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream = 0)
   {
     return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
   }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
-  template <typename IteratorT, typename ScanOpT>
+  template <typename IteratorT, typename ScanOpT, typename NumItemsT>
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     IteratorT d_data,
     ScanOpT scan_op,
-    int num_items,
+    NumItemsT num_items,
     cudaStream_t stream,
     bool debug_synchronous)
   {
diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh
index eb6eecdcf3..cc627b971c 100644
--- a/cub/cub/device/device_segmented_radix_sort.cuh
+++ b/cub/cub/device/device_segmented_radix_sort.cuh
@@ -26,8 +26,9 @@
  *
  ******************************************************************************/

-//! @file cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort
-//! across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+//! @file
+//! cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across
+//!
multiple, non-overlapping sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 6a0875734e..ec5d017fc2 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSegmentedReduce provides device-wide, parallel operations -//! for computing a batched reduction across multiple sequences of data -//! items residing within device-accessible memory. +//! @file +//! cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across +//! multiple sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 67a22c5e54..7d01b6d56a 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -25,9 +25,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSegmentedSort provides device-wide, parallel operations for -//! computing a batched sort across multiple, non-overlapping sequences of -//! data items residing within device-accessible memory. +//! @file +//! cub::DeviceSegmentedSort provides device-wide, parallel operations for computing a batched sort across multiple, +//! non-overlapping sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 703a912829..332bbe6c7d 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSelect provides device-wide, parallel operations for -//! compacting selected items from sequences of data items residing within -//! device-accessible memory. +//! @file +//! cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data +//! items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 32ac433f3e..8b7e60d435 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -27,8 +27,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSpmv provides device-wide parallel operations for performing -//! sparse-matrix * vector multiplication (SpMV). +//! @file +//! cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication +//! (SpMV). #pragma once diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh new file mode 100644 index 0000000000..984109692f --- /dev/null +++ b/cub/cub/device/device_transform.cuh @@ -0,0 +1,271 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/detail/nvtx.cuh>
+#include <cub/device/dispatch/dispatch_transform.cuh>
+#include <cub/util_namespace.cuh>
+
+#include <cuda/std/tuple>
+
+CUB_NAMESPACE_BEGIN
+
+//! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
+//! sequences into an output sequence.
+struct DeviceTransform
+{
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
+  //! (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!    :language: c++
+  //!    :dedent:
+  //!    :start-after: example-begin transform-many
+  //!    :end-before: example-end transform-many
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
+    return detail::transform::
+      dispatch_t<false, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
+        dispatch(
+          ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
+  // APIs.
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return Transform(
+      ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+  //! @rst
+  //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity
+  //! (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //! @endrst
+  //!
+  //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
+  //! must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    return Transform(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB
+  // APIs.
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return Transform(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. The objects passed to the call operator
+  //! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!    :language: c++
+  //!    :dedent:
+  //!    :start-after: example-begin transform-many-stable
+  //!    :end-before: example-end transform-many-stable
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
+    return detail::transform::
+      dispatch_t<true, int, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, TransformOp>::
+        dispatch(
+          ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return TransformStableArgumentAddresses(
+      ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+  //! @rst
+  //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding
+  //! input elements and writing the result to the corresponding output element. The objects passed to the call operator
+  //! of the transformation operation are guaranteed to reside in the input sequences and are never copied.
+  //! @endrst
+  //!
+  //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type
+  //! must be trivially relocatable.
+  //! @param output An iterator to the output sequence where num_items results are written to.
+  //! @param num_items The number of elements in each input sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be assignable to the dereferenced output iterator.
+  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    return TransformStableArgumentAddresses(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    RandomAccessIteratorIn input,
+    RandomAccessIteratorOut output,
+    int num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return TransformStableArgumentAddresses(
+      ::cuda::std::make_tuple(::cuda::std::move(input)),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(transform_op),
+      stream);
+  }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 7a657d6703..4af6864b03 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -38,6 +38,7 @@
 #endif // no system header

 #include <cub/agent/agent_for.cuh>
+#include <cub/device/dispatch/kernels/for_each.cuh>
 #include <cub/device/dispatch/tuning/tuning_for.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_namespace.cuh>
@@ -56,106 +57,6 @@ namespace detail
 namespace for_each
 {

-template <typename T>
-struct first_parameter
-{
-  using type = void;
-};
-
-template <typename C, typename R, typename A>
-struct first_parameter<R (C::*)(A)>
-{
-  using type = A;
-};
-
-template <typename C, typename R, typename A>
-struct first_parameter<R (C::*)(A) const>
-{
-  using type = A;
-};
-
-template <typename T>
-using first_parameter_t = typename first_parameter<decltype(&T::operator())>::type;
-
-template <typename Value, typename Fn, typename = void>
-struct has_unique_value_overload : ::cuda::std::false_type
-{};
-
-// clang-format off
-template <class Value, class Fn>
-struct has_unique_value_overload<
-  Value,
-  Fn,
-  typename ::cuda::std::enable_if<
-    !::cuda::std::is_reference<first_parameter_t<Fn>>::value &&
-    ::cuda::std::is_convertible<Value, first_parameter_t<Fn>>::value>::type>
-  : ::cuda::std::true_type
-{};
-
-// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable.
-// This can be observable if the unary operator takes parameter by reference and modifies it or uses address.
-// The trait below checks if the freedom to copy trivial types can be regained.
-template <typename Value, typename Fn>
-using can_regain_copy_freedom =
-  ::cuda::std::integral_constant<
-    bool,
-    ::cuda::std::is_trivially_constructible<Value>::value &&
-    ::cuda::std::is_trivially_copy_assignable<Value>::value &&
-    ::cuda::std::is_trivially_move_assignable<Value>::value &&
-    ::cuda::std::is_trivially_destructible<Value>::value &&
-    has_unique_value_overload<Value, Fn>::value>;
-// clang-format on
-
-// This kernel is used when the block size is not known at compile time
-template <typename ChainedPolicyT, typename OffsetT, typename OpT>
-CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
-{
-  using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
-  using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
-
-  const auto block_threads  = static_cast<OffsetT>(blockDim.x);
-  const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
-  const auto tile_base      = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
-  const auto num_remaining  = num_items - tile_base;
-  const auto items_in_tile  = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);
-
-  if (items_in_tile == items_per_tile)
-  {
-    agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
-  }
-  else
-  {
-    agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
-  }
-}
-
-// This kernel is used when the block size is known at compile time
-template <typename ChainedPolicyT, typename OffsetT, typename OpT>
-CUB_DETAIL_KERNEL_ATTRIBUTES //
-__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
-  void static_kernel(OffsetT num_items, OpT op)
-{
-  using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
-  using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
-
-  constexpr auto block_threads  = active_policy_t::block_threads;
-  constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads;
-
-  const auto tile_base     = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
-  const auto num_remaining = num_items - tile_base;
-  const auto items_in_tile = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);
-
-  if (items_in_tile == items_per_tile)
-  {
-    agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
-  }
-  else
-  {
-    agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
-  }
-}
-
 // The dispatch layer is in the detail namespace until we figure out tuning API
 template <typename OffsetT, typename OpT, typename PolicyHubT>
 struct dispatch_t : PolicyHubT
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index aa8cc2f5c0..1839385b19 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -238,12 +238,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK
   AgentHistogramT agent(
     temp_storage,
     d_samples,
-    num_output_bins_wrapper.__elems_,
-    num_privatized_bins_wrapper.__elems_,
-    d_output_histograms_wrapper.__elems_,
-    d_privatized_histograms_wrapper.__elems_,
-    output_decode_op_wrapper.__elems_,
-    privatized_decode_op_wrapper.__elems_);
+    num_output_bins_wrapper.data(),
+    num_privatized_bins_wrapper.data(),
+    d_output_histograms_wrapper.data(),
+    d_privatized_histograms_wrapper.data(),
+    output_decode_op_wrapper.data(),
+    privatized_decode_op_wrapper.data());

   // Initialize counters
   agent.InitBinCounters();
@@ -847,7 +847,7 @@ public:
   {
     // GCC 14 rightfully warns that when a value-initialized array of this struct is copied using memcpy, uninitialized
    // bytes may be accessed. To avoid this, we add a dummy member, so value initialization actually initializes the memory.
-#if defined(_CCCL_COMPILER_GCC) && __GNUC__ == 14 +#if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 13 char dummy; #endif diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index e3e3844a3f..ba5365c618 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -316,7 +316,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy, typename TransformOpT = ::cuda::std::__identity> struct DispatchReduce : SelectedPolicy @@ -797,17 +797,16 @@ struct DispatchReduce : SelectedPolicy * @tparam InitT * Initial value type */ -template >>, - typename SelectedPolicyT = DeviceReducePolicy> +template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename TransformOpT, + typename InitT, + typename AccumT = ::cuda::std:: + __accumulator_t>, InitT>, + typename SelectedPolicyT = DeviceReducePolicy> using DispatchTransformReduce = DispatchReduce; @@ -850,7 +849,7 @@ template >, - typename AccumT = detail::accumulator_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitT>, typename SelectedPolicy = DeviceReducePolicy> struct DispatchSegmentedReduce : SelectedPolicy { diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 07dd492a53..8ae232e8d1 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -213,25 +213,25 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. 
*/ -template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename EqualityOpT, - typename ReductionOpT, - typename OffsetT, - typename AccumT = // - detail:: - accumulator_t, cub::detail::value_t>, - typename SelectedPolicy = // - detail::device_reduce_by_key_policy_hub< // - ReductionOpT, // - AccumT, // - cub::detail::non_void_value_t< // - UniqueOutputIteratorT, // - cub::detail::value_t>>> +template , + cub::detail::value_t>, + typename SelectedPolicy = // + detail::device_reduce_by_key_policy_hub< // + ReductionOpT, // + AccumT, // + cub::detail::non_void_value_t< // + UniqueOutputIteratorT, // + cub::detail::value_t>>> struct DispatchReduceByKey { //------------------------------------------------------------------------- diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 56c2be9611..7d2fc4ac17 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -234,11 +234,11 @@ template ::value, - cub::detail::value_t, - typename InitValueT::value_type>, - cub::detail::value_t>, + typename AccumT = ::cuda::std::__accumulator_t, + ::cuda::std::_If::value, + cub::detail::value_t, + typename InitValueT::value_type>>, typename SelectedPolicy = DeviceScanPolicy, bool ForceInclusive = false> struct DispatchScan : SelectedPolicy diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 032554773a..5dfffa5e77 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -228,10 +228,10 @@ template < typename ScanOpT, typename InitValueT, typename OffsetT, - typename AccumT = detail::accumulator_t< + typename AccumT = ::cuda::std::__accumulator_t< ScanOpT, - ::cuda::std::_If::value, cub::detail::value_t, InitValueT>, - cub::detail::value_t>, + cub::detail::value_t, + ::cuda::std::_If::value, cub::detail::value_t, InitValueT>>, typename SelectedPolicy = DeviceScanByKeyPolicy, ScanOpT>> struct DispatchScanByKey : SelectedPolicy diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh new file mode 100644 index 0000000000..8fb596da07 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -0,0 +1,866 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 +_CCCL_NV_DIAG_SUPPRESS(186) +# include +// we cannot re-enable the warning here, because it is triggered outside the translation unit +// see also: https://godbolt.org/z/1x8b4hn3G +#endif // defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +CUB_NAMESPACE_BEGIN + +// the ublkcp kernel needs PTX features that are only available and understood by CTK 12 and later +#if _CCCL_CUDACC_VER_MAJOR >= 12 +# define _CUB_HAS_TRANSFORM_UBLKCP +#endif // _CCCL_CUDACC_VER_MAJOR >= 12 + +namespace detail +{ +namespace transform +{ +_CCCL_HOST_DEVICE constexpr int sum() +{ + return 0; +} + +// TODO(bgruber): remove with C++17 +template +_CCCL_HOST_DEVICE constexpr int sum(int head, Ts... tail) +{ + return head + sum(tail...); +} + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return (int{sizeof(value_t)} + ... + 0); +} +#else // ^^^ C++17 ^^^ / vvv C++11 vvv +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return sum(int{sizeof(value_t)}...); +} +#endif // _CCCL_STD_VER >= 2017 + +enum class Algorithm +{ + fallback_for, +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + ublkcp, +#endif // _CUB_HAS_TRANSFORM_UBLKCP +}; + +// this kernel replicates the behavior of cub::DeviceFor::Bulk +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int /* items_per_thread */, + F transform_op, + RandomAccessIteratorOut out, + RandomAccessIteratorsIn... ins) +{ + auto op = [&](Offset i) { + out[i] = transform_op(ins[i]...); + }; + using OpT = decltype(op); + + // TODO(bgruber): verbatim copy from for_each's static_kernel below: + using agent_t = for_each::agent_block_striped_t; + + constexpr auto block_threads = ForPolicy::block_threads; + constexpr auto items_per_tile = ForPolicy::items_per_thread * block_threads; + + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +template +struct async_copy_policy_t +{ + static constexpr int block_threads = BlockThreads; + // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy + static constexpr int min_items_per_thread = 1; + static constexpr int max_items_per_thread = 32; +}; + +// TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17. 
+template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence) + -> decltype(::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...)) +{ + return ::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...); +} + +template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) + -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{})) +{ + return poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{}); +} + +// mult must be a power of 2 +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Integral x, Integral mult) -> Integral +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t>(mult)), ""); +#endif // _CCCL_STD_VER > 2011 + return (x + mult - 1) & ~(mult - 1); +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(alignment), ""); +#endif // _CCCL_STD_VER > 2011 + return reinterpret_cast( + reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1}); +} + +// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding +// +// For performance considerations of memcpy_async: +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidance-for-memcpy-async +// +// We basically have to align the base pointer to 16 bytes, and copy a multiple of 16 bytes. To achieve this, when we +// copy a tile of data from an input buffer, we round down the pointer to the start of the tile to the next lower +// address that is a multiple of 16 bytes. This introduces head padding. We also round up the total number of bytes to +// copy (including head padding) to a multiple of 16 bytes, which introduces tail padding. For the bulk copy kernel, we +// have to align to 128 bytes instead of 16. +// +// However, padding memory copies like that may access the input buffer out-of-bounds. Here are some thoughts: +// * According to the CUDA programming guide +// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses), "any address of a variable +// residing in global memory or returned by one of the memory allocation routines from the driver or runtime API is +// always aligned to at least 256 bytes." +// * Memory protection is usually done on memory page level, which is even larger than 256 bytes for CUDA and 4KiB on +// Intel x86 and 4KiB+ ARM. Front and tail padding thus never leaves the memory page of the input buffer. +// * This should count for device memory, but also for device accessible memory living on the host. +// * The base pointer alignment and size rounding also never leaves the size of a cache line. +// +// Copying larger data blocks with head and tail padding should thus be legal. Nevertheless, an out-of-bounds read is +// still technically undefined behavior in C++. Also, compute-sanitizer flags at least such reads after the end of a +// buffer. Therefore, we lean on the safer side and protect against out of bounds reads at the beginning and end. 
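+// A worked example of the padding math above (illustrative values, not part of
+// the implementation): for a tile of 10 int3 elements (12 bytes each) whose first
+// element sits 4 bytes past a 16-byte boundary, round_down_ptr yields
+// head_padding == 4 and the copy covers
+// round_up_to_po2_multiple(4 + 12 * 10, 16) == 128 bytes: 4 bytes of head padding
+// and 4 bytes of tail padding around the 120 payload bytes.
+static_assert(round_up_to_po2_multiple(4 + 12 * 10, 16) == 128, "worked example from the notes above");
+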
+ +// A note on size and alignment: The size of a type is at least as large as its alignment. We rely on this fact in some +// conditions. +// This is guaranteed by the C++ standard, and follows from the definition of arrays: the difference between neighboring +// array element addresses is sizeof element type and each array element needs to fulfill the alignment requirement of +// the element type. + +// Pointer with metadata to describe readonly input memory for memcpy_async and UBLKCP kernels. +// cg::memcpy_async is most efficient when the data is 16-byte aligned and the size a multiple of 16 bytes +// UBLKCP is most efficient when the data is 128-byte aligned and the size a multiple of 16 bytes +template // Cannot add alignment to signature, because we need a uniform kernel template instantiation +struct aligned_base_ptr +{ + using value_type = T; + + const char* ptr; // aligned pointer before the original pointer (16-byte or 128-byte). May not be aligned to + // alignof(T). E.g.: array of int3 starting at address 4, ptr == 0 + int head_padding; // byte offset between ptr and the original pointer. Value inside [0;15] or [0;127]. + + _CCCL_HOST_DEVICE const T* ptr_to_elements() const + { + return reinterpret_cast(ptr + head_padding); + } + + _CCCL_HOST_DEVICE friend bool operator==(const aligned_base_ptr& a, const aligned_base_ptr& b) + { + return a.ptr == b.ptr && a.head_padding == b.head_padding; + } +}; + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr(const T* ptr, int alignment) -> aligned_base_ptr +{ + const char* base_ptr = round_down_ptr(ptr, alignment); + return aligned_base_ptr{base_ptr, static_cast(reinterpret_cast(ptr) - base_ptr)}; +} + +constexpr int bulk_copy_alignment = 128; +constexpr int bulk_copy_size_multiple = 16; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +_CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one() +{ + const ::cuda::std::uint32_t membermask = ~0; + ::cuda::std::uint32_t is_elected; + asm volatile( + "{\n\t .reg .pred P_OUT; \n\t" + "elect.sync _|P_OUT, %1;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(is_elected) + : "r"(membermask) + :); + return threadIdx.x < 32 && static_cast(is_elected); +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE void bulk_copy_tile( + ::cuda::std::uint64_t& bar, + int tile_stride, + char* smem, + int& smem_offset, + ::cuda::std::uint32_t& total_bytes_bulk_copied, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + static_assert(alignof(T) <= bulk_copy_alignment, ""); + + const char* src = aligned_ptr.ptr + global_offset * sizeof(T); + char* dst = smem + smem_offset; + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % bulk_copy_alignment == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % bulk_copy_alignment == 0, ""); + + // TODO(bgruber): we could precompute bytes_to_copy on the host + const int bytes_to_copy = round_up_to_po2_multiple( + aligned_ptr.head_padding + static_cast(sizeof(T)) * tile_stride, bulk_copy_size_multiple); + + ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar); + total_bytes_bulk_copied += bytes_to_copy; + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +template +_CCCL_DEVICE void bulk_copy_tile_fallback( + int tile_size, + int tile_stride, + char* smem, + int& smem_offset, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + const T* src = 
aligned_ptr.ptr_to_elements() + global_offset; + T* dst = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % alignof(T) == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % alignof(T) == 0, ""); + + const int bytes_to_copy = static_cast(sizeof(T)) * tile_size; + cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy); + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE _CCCL_FORCEINLINE const T& +fetch_operand(int tile_stride, const char* smem, int& smem_offset, int smem_idx, const aligned_base_ptr& aligned_ptr) +{ + const T* smem_operand_tile_base = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment; + return smem_operand_tile_base[smem_idx]; +} + +template +_CCCL_DEVICE void transform_kernel_ublkcp( + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr... aligned_ptrs) +{ + __shared__ uint64_t bar; + extern __shared__ char __align__(bulk_copy_alignment) smem[]; + + namespace ptx = ::cuda::ptx; + + constexpr int block_dim = BulkCopyPolicy::block_threads; + const int tile_stride = block_dim * num_elem_per_thread; + const Offset offset = static_cast(blockIdx.x) * tile_stride; + const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + + const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; + if (inner_blocks) + { + // use one thread to setup the entire bulk copy + if (elect_one()) + { + ptx::mbarrier_init(&bar, 1); + ptx::fence_proxy_async(ptx::space_shared); + + int smem_offset = 0; + ::cuda::std::uint32_t total_copied = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile(bar, tile_stride, smem, smem_offset, total_copied, offset, aligned_ptrs), 0)..., + 0}; + (void) dummy; + + // TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet + ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied); + } + + // all threads wait for bulk copy + __syncthreads(); + while (!ptx::mbarrier_try_wait_parity(&bar, 0)) + ; + } + else + { + // use all threads to schedule an async_memcpy + int smem_offset = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile_fallback(tile_size, tile_stride, smem, smem_offset, offset, aligned_ptrs), 0)..., 0}; + (void) dummy; + + cooperative_groups::wait(cooperative_groups::this_thread_block()); + } + + // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below + out += offset; + + // note: I tried expressing the UBLKCP_AGENT as a function object but it adds a lot of code to handle the variadics + // TODO(bgruber): use a polymorphic lambda in C++14 +# define UBLKCP_AGENT(full_tile) \ + _Pragma("unroll 1") /* Unroll 1 tends to improve performance, especially for smaller data types (confirmed by \ + benchmark) */ \ + for (int j = 0; j < num_elem_per_thread; ++j) \ + { \ + const int idx = j * block_dim + threadIdx.x; \ + if (full_tile || idx < tile_size) \ + { \ + int smem_offset = 0; \ + /* need to expand into a tuple for guaranteed order of evaluation*/ \ 
+ out[idx] = poor_apply( \ + [&](const InTs&... values) { \ + return f(values...); \ + }, \ + ::cuda::std::tuple{fetch_operand(tile_stride, smem, smem_offset, idx, aligned_ptrs)...}); \ + } \ + } + if (tile_stride == tile_size) + { + UBLKCP_AGENT(true); + } + else + { + UBLKCP_AGENT(false); + } +# undef UBLKCP_AGENT +} + +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + aligned_base_ptr... aligned_ptrs) +{ + // only call the real kernel for sm90 and later + NV_IF_TARGET(NV_PROVIDES_SM_90, + (transform_kernel_ublkcp(num_items, num_elem_per_thread, f, out, aligned_ptrs...);)); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template +union kernel_arg +{ + aligned_base_ptr> aligned_ptr; + It iterator; + + _CCCL_HOST_DEVICE kernel_arg() {} // in case It is not default-constructible +}; + +template +_CCCL_HOST_DEVICE auto make_iterator_kernel_arg(It it) -> kernel_arg +{ + kernel_arg arg; + arg.iterator = it; + return arg; +} + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) -> kernel_arg +{ + kernel_arg arg; + arg.aligned_ptr = make_aligned_base_ptr(ptr, alignment); + return arg; +} + +// TODO(bgruber): make a variable template in C++14 +template +using needs_aligned_ptr_t = + ::cuda::std::bool_constant; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( + ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +{ + return ::cuda::std::move(arg.aligned_ptr); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) -> It&& +{ + return ::cuda::std::move(arg.iterator); +} + +// There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated +// with the same arguments for each algorithm. Only the device compiler will then select the implementation. This +// saves some compile-time and binary size. +template +__launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads) + CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + kernel_arg... ins) +{ + constexpr auto alg = ::cuda::std::integral_constant{}; + transform_kernel_impl( + alg, + num_items, + num_elem_per_thread, + ::cuda::std::move(f), + ::cuda::std::move(out), + select_kernel_arg(alg, ::cuda::std::move(ins))...); +} + +constexpr int arch_to_min_bytes_in_flight(int sm_arch) +{ + // TODO(bgruber): use if-else in C++14 for better readability + return sm_arch >= 900 ? 48 * 1024 // 32 for H100, 48 for H200 + : sm_arch >= 800 ? 
16 * 1024 // A100
+ : 12 * 1024; // V100 and below
+}
+
+template
+_CCCL_HOST_DEVICE constexpr auto bulk_copy_smem_for_tile_size(int tile_size) -> int
+{
+ return round_up_to_po2_multiple(int{sizeof(int64_t)}, bulk_copy_alignment) /* bar */
+ // 128 bytes of padding for each input tile (handles before + after)
+ + tile_size * loaded_bytes_per_iteration()
+ + sizeof...(RandomAccessIteratorsIn) * bulk_copy_alignment;
+}
+
+using fallback_for_policy = for_each::policy_hub_t::policy_350_t::for_policy_t;
+
+template
+struct policy_hub
+{
+ static_assert(sizeof(RandomAccessIteratorTupleIn) == 0, "Second parameter must be a tuple");
+};
+
+template
+struct policy_hub>
+{
+ static constexpr bool no_input_streams = sizeof...(RandomAccessIteratorsIn) == 0;
+ static constexpr bool all_contiguous =
+ ::cuda::std::conjunction...>::value;
+ static constexpr bool all_values_trivially_reloc =
+ ::cuda::std::conjunction>...>::value;
+
+ static constexpr bool can_memcpy = all_contiguous && all_values_trivially_reloc;
+
+ // TODO(bgruber): consider a separate kernel for just filling
+
+ struct policy300 : ChainedPolicy<300, policy300, policy300>
+ {
+ static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
+ // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
+ static constexpr auto algorithm = Algorithm::fallback_for;
+ using algo_policy = fallback_for_policy;
+ };
+
+#ifdef _CUB_HAS_TRANSFORM_UBLKCP
+ // H100 and H200
+ struct policy900 : ChainedPolicy<900, policy900, policy300>
+ {
+ static constexpr int min_bif = arch_to_min_bytes_in_flight(900);
+ using async_policy = async_copy_policy_t<256>;
+ static constexpr bool exhaust_smem =
+ bulk_copy_smem_for_tile_size(
+ async_policy::block_threads * async_policy::min_items_per_thread)
+ > 48 * 1024;
+ static constexpr bool any_type_is_overaligned =
+# if _CCCL_STD_VER >= 2017
+ ((alignof(value_t) > bulk_copy_alignment) || ...);
+# else
+ sum((alignof(value_t) > bulk_copy_alignment)...) > 0;
+# endif
+
+ static constexpr bool use_fallback =
+ RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overaligned;
+ static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp;
+ using algo_policy = ::cuda::std::_If;
+ };
+
+ using max_policy = policy900;
+#else // _CUB_HAS_TRANSFORM_UBLKCP
+ using max_policy = policy300;
+#endif // _CUB_HAS_TRANSFORM_UBLKCP
+};
+
+// TODO(bgruber): replace by ::cuda::std::expected in C++14
+template
+struct PoorExpected
+{
+ alignas(T) char storage[sizeof(T)];
+ cudaError_t error;
+
+ _CCCL_HOST_DEVICE PoorExpected(T value)
+ : error(cudaSuccess)
+ {
+ new (storage) T(::cuda::std::move(value));
+ }
+
+ _CCCL_HOST_DEVICE PoorExpected(cudaError_t error)
+ : error(error)
+ {}
+
+ _CCCL_HOST_DEVICE explicit operator bool() const
+ {
+ return error == cudaSuccess;
+ }
+
+ _CCCL_HOST_DEVICE T& operator*()
+ {
+ _CCCL_DIAG_PUSH
+ _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
+ return reinterpret_cast(storage);
+ _CCCL_DIAG_POP
+ }
+
+ _CCCL_HOST_DEVICE const T& operator*() const
+ {
+ _CCCL_DIAG_PUSH
+ _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing")
+ return reinterpret_cast(storage);
+ _CCCL_DIAG_POP
+ }
+
+ _CCCL_HOST_DEVICE T* operator->()
+ {
+ return &**this;
+ }
+
+ _CCCL_HOST_DEVICE const T* operator->() const
+ {
+ return &**this;
+ }
+};
+
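+// Illustrative sketch of how PoorExpected is consumed (it mirrors the call sites
+// further below; some_query is a hypothetical producer, not part of this change):
+//
+//   PoorExpected<int> result = some_query();
+//   if (!result)
+//   {
+//     return result.error; // propagate the cudaError_t
+//   }
+//   const int value = *result; // dereference only after checking
+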
+// TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this.
+_CCCL_HOST_DEVICE inline PoorExpected get_max_shared_memory()
+{
+ // gevtushenko promised me that I can assume that the stream passed to the CUB API entry point (on which the kernels
+ // will later be launched) belongs to the currently active device. So we can just query the active device here.
+ int device = 0;
+ auto error = CubDebug(cudaGetDevice(&device));
+ if (error != cudaSuccess)
+ {
+ return error;
+ }
+
+ int max_smem = 0;
+ error = CubDebug(cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlock, device));
+ if (error != cudaSuccess)
+ {
+ return error;
+ }
+
+ return max_smem;
+}
+
+struct elem_counts
+{
+ int elem_per_thread;
+ int tile_size;
+ int smem_size;
+};
+
+template >
+struct dispatch_t;
+
+template
+struct dispatch_t,
+ RandomAccessIteratorOut,
+ TransformOp,
+ PolicyHub>
+{
+ static_assert(::cuda::std::is_same::value
+ || ::cuda::std::is_same::value,
+ "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types");
+
+ ::cuda::std::tuple in;
+ RandomAccessIteratorOut out;
+ Offset num_items;
+ TransformOp op;
+ cudaStream_t stream;
+
+#define CUB_DETAIL_TRANSFORM_KERNEL_PTR \
+ &transform_kernel...>
+
+ static constexpr int loaded_bytes_per_iter = loaded_bytes_per_iteration();
+
+#ifdef _CUB_HAS_TRANSFORM_UBLKCP
+ // TODO(bgruber): I want to write tests for this, but those highly depend on the architecture we are running on.
+ template
+ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel()
+ -> PoorExpected<
+ ::cuda::std::
+ tuple>
+ {
+ using policy_t = typename ActivePolicy::algo_policy;
+ constexpr int block_dim = policy_t::block_threads;
+ static_assert(block_dim % bulk_copy_alignment == 0,
+ "block_threads needs to be a multiple of bulk_copy_alignment (128)"); // then tile_size is a multiple
+ // of 128 bytes
+
+ auto determine_element_counts = [&]() -> PoorExpected {
+ const auto max_smem = get_max_shared_memory();
+ if (!max_smem)
+ {
+ return max_smem.error;
+ }
+
+ elem_counts last_counts{};
+ // Increase the number of output elements per thread until we reach the required bytes in flight.
+ static_assert(policy_t::min_items_per_thread <= policy_t::max_items_per_thread, ""); // ensures the loop below + // runs at least once + for (int elem_per_thread = +policy_t::min_items_per_thread; elem_per_thread < +policy_t::max_items_per_thread; + ++elem_per_thread) + { + const int tile_size = block_dim * elem_per_thread; + const int smem_size = bulk_copy_smem_for_tile_size(tile_size); + if (smem_size > *max_smem) + { +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + // assert should be prevented by smem check in policy + assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory"); +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + return last_counts; + } + + if (tile_size >= num_items) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + int max_occupancy = 0; + const auto error = + CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, smem_size)); + if (error != cudaSuccess) + { + return error; + } + + const int bytes_in_flight_SM = max_occupancy * tile_size * loaded_bytes_per_iter; + if (ActivePolicy::min_bif <= bytes_in_flight_SM) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + last_counts = elem_counts{elem_per_thread, tile_size, smem_size}; + } + return last_counts; + }; + PoorExpected config = [&]() { + NV_IF_TARGET( + NV_IS_HOST, + ( + // this static variable exists for each template instantiation of the surrounding function and class, on which + // the chosen element count solely depends (assuming max SMEM is constant during a program execution) + static auto cached_config = determine_element_counts(); return cached_config;), + ( + // we cannot cache the determined element count in device code + return determine_element_counts();)); + }(); + if (!config) + { + return config.error; + } +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + assert(config->elem_per_thread > 0); + assert(config->tile_size > 0); + assert(config->tile_size % bulk_copy_alignment == 0); + assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{config->tile_size})); + return ::cuda::std::make_tuple( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, config->smem_size, stream), + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + config->elem_per_thread); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + auto ret = configure_ublkcp_kernel(); + if (!ret) + { + return ret.error; + } + // TODO(bgruber): use a structured binding in C++17 + // auto [launcher, kernel, elem_per_thread] = *ret; + + return ::cuda::std::get<0>(*ret).doit( + ::cuda::std::get<1>(*ret), + num_items, + ::cuda::std::get<2>(*ret), + op, + out, + make_aligned_base_ptr_kernel_arg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), bulk_copy_alignment)...); + } +#endif // _CUB_HAS_TRANSFORM_UBLKCP + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + constexpr int block_threads = ActivePolicy::algo_policy::block_threads; + constexpr int items_per_thread = ActivePolicy::algo_policy::items_per_thread; + constexpr int tile_size = block_threads * items_per_thread; + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, 
Offset{tile_size})); + return CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_threads, 0, stream) + .doit( + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + num_items, + items_per_thread, + op, + out, + make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + { + // // TODO(bgruber): replace the overload set by if constexpr in C++17 + return invoke_algorithm(::cuda::std::index_sequence_for{}, + ::cuda::std::integral_constant{}); + } + + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + TransformOp op, + cudaStream_t stream) + { + if (num_items == 0) + { + return cudaSuccess; + } + + int ptx_version = 0; + auto error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + return error; + } + + dispatch_t dispatch{::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(op), stream}; + return CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); + } + +#undef CUB_DETAIL_TRANSFORM_KERNEL_PTR +}; +} // namespace transform +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/for_each.cuh b/cub/cub/device/dispatch/kernels/for_each.cuh new file mode 100644 index 0000000000..2213252d2f --- /dev/null +++ b/cub/cub/device/dispatch/kernels/for_each.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +namespace for_each +{ + +template +struct first_parameter +{ + using type = void; +}; + +template +struct first_parameter +{ + using type = A; +}; + +template +struct first_parameter +{ + using type = A; +}; + +template +using first_parameter_t = typename first_parameter::type; + +template +struct has_unique_value_overload : ::cuda::std::false_type +{}; + +// clang-format off +template +struct has_unique_value_overload< + Value, + Fn, + typename ::cuda::std::enable_if< + !::cuda::std::is_reference>::value && + ::cuda::std::is_convertible + >::value>::type> + : ::cuda::std::true_type +{}; + +// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable. +// This can be observable if the unary operator takes parameter by reference and modifies it or uses address. +// The trait below checks if the freedom to copy trivial types can be regained. +template +using can_regain_copy_freedom = + ::cuda::std::integral_constant< + bool, + ::cuda::std::is_trivially_constructible::value && + ::cuda::std::is_trivially_copy_assignable::value && + :: cuda::std::is_trivially_move_assignable::value && + ::cuda::std::is_trivially_destructible::value && + has_unique_value_overload::value>; +// clang-format on + +// This kernel is used when the block size is not known at compile time +template +CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op) +{ + using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; + using agent_t = agent_block_striped_t; + + const auto block_threads = static_cast(blockDim.x); + const auto items_per_tile = active_policy_t::items_per_thread * block_threads; + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +// This kernel is used when the block size is known at compile time +template +CUB_DETAIL_KERNEL_ATTRIBUTES // +__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) // + void static_kernel(OffsetT num_items, OpT op) +{ + using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; + using agent_t = agent_block_striped_t; + + constexpr auto block_threads = active_policy_t::block_threads; + constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads; + + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? 
num_remaining : items_per_tile);
+
+ if (items_in_tile == items_per_tile)
+ {
+ agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads);
+ }
+ else
+ {
+ agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads);
+ }
+}
+
+} // namespace for_each
+} // namespace detail
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index 21ed8592d6..4df4b49ac0 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -47,14 +47,15 @@
 # pragma system_header
 #endif // no system header
+#include // always_false
 #include
 #include
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-#include
-_CCCL_SUPPRESS_DEPRECATED_POP
-#include
-#include
+#include // cuda::std::plus
+#include // cuda::std::common_type
+#include // cuda::std::forward
+
+// #include // std::plus
 CUB_NAMESPACE_BEGIN
@@ -413,4 +414,121 @@ _CCCL_HOST_DEVICE BinaryFlip MakeBinaryFlip(BinaryOpT binary_op)
 return BinaryFlip(binary_op);
 }
+namespace internal
+{
+// TODO: Remove DPX specialization when nvbug 4823237 is fixed
+
+template
+struct DpxMin
+{
+ static_assert(detail::always_false(), "DpxMin is not supported for this type");
+};
+
+template <>
+struct DpxMin<::cuda::std::int16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vmins2(a, b);
+ }
+};
+
+template <>
+struct DpxMin<::cuda::std::uint16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vminu2(a, b);
+ }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template
+struct DpxMax
+{
+ static_assert(detail::always_false(), "DpxMax is not supported for this type");
+};
+
+template <>
+struct DpxMax<::cuda::std::int16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vmaxs2(a, b);
+ }
+};
+
+template <>
+struct DpxMax<::cuda::std::uint16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vmaxu2(a, b);
+ }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template
+struct DpxSum
+{
+ static_assert(detail::always_false(), "DpxSum is not supported for this type");
+};
+
+template <>
+struct DpxSum<::cuda::std::int16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vadd2(a, b);
+ }
+};
+
+template <>
+struct DpxSum<::cuda::std::uint16_t>
+{
+ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE unsigned operator()(unsigned a, unsigned b) const
+ {
+ return __vadd2(a, b);
+ }
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+
+template
+struct CubOperatorToDpx
+{
+ static_assert(detail::always_false(), "Dpx is not supported for this operator");
+};
+
+template
+struct CubOperatorToDpx
+{
+ using type = DpxMin;
+};
+
+template
+struct CubOperatorToDpx
+{
+ using type = DpxMax;
+};
+
+template
+struct CubOperatorToDpx
+{
+ using type = DpxSum;
+};
+
+// template
+// struct CubOperatorToDpx, T>
+//{
+// using type = DpxSum;
+// };
+
+template
+using cub_operator_to_dpx_t = CubOperatorToDpx;
+
+} // namespace internal
+
 CUB_NAMESPACE_END
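A quick illustration of what these DPX functors do (a sketch by way of explanation, not part of the change): __vmins2 and its siblings treat a 32-bit register as two packed 16-bit lanes and reduce both lanes at once (hardware DPX on Hopper, emulated on earlier architectures, as noted in thread_reduce.cuh below). That is why DpxMin<::cuda::std::int16_t> takes and returns unsigned rather than the element type. A minimal device-side helper, assuming CUDA's short2 vector type and the documented __vmins2 intrinsic:

#include <cstring> // std::memcpy; the compiler lowers these small copies to register moves

__device__ short2 min_two_lanes(short2 a, short2 b) // hypothetical helper, not in the diff
{
  unsigned ua, ub;
  std::memcpy(&ua, &a, sizeof(ua)); // view the two 16-bit lanes as one 32-bit word
  std::memcpy(&ub, &b, sizeof(ub));
  const unsigned r = __vmins2(ua, ub); // per-halfword signed minimum
  short2 out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}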
diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index db7e242779..a956321f78 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -28,7 +28,7 @@
 /**
  * @file
- * Thread utilities for sequential reduction over statically-sized array types
+ * Thread reduction over statically-sized array-like types
  */
 #pragma once
@@ -43,8 +43,17 @@
 # pragma system_header
 #endif // no system header
-#include
-#include
+#include // are_same()
+#include // cub_operator_to_dpx_t
+#include
+#include
+
+#include // bit_cast
+#include // uint16_t
+#include // cuda::std::plus
+#include // pair
+
+// #include // std::plus
 CUB_NAMESPACE_BEGIN
@@ -52,51 +61,143 @@ CUB_NAMESPACE_BEGIN
 namespace internal
 {
-/**
- * @brief Sequential reduction over statically-sized array types
- *
- * @param[in] input
- * Input array
- *
- * @param[in] reduction_op
- * Binary reduction operator
- *
- * @param[in] prefix
- * Prefix to seed reduction with
- */
-template >
-_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
-ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type /*length*/)
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+/// DPX instructions compute min, max, and sum for up to three 16- and 32-bit signed or unsigned integer parameters
+/// see DPX documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dpx
+/// NOTE: The compiler is able to automatically vectorize all cases with 3 operands
+/// However, all other cases with per-halfword comparison need to be explicitly vectorized
+/// TODO: Remove DPX specialization when nvbug 4823237 is fixed
+///
+/// DPX reduction is enabled if the following conditions are met:
+/// - Hopper+ architectures. DPX instructions are emulated before Hopper
+/// - The number of elements must be large enough for performance reasons (see below)
+/// - All types must be the same
+/// - Only works with integral types of 2 bytes
+/// - DPX instructions provide Min, Max, and Sum SIMD operations
+/// If the number of instructions is the same, we favor the compiler
+
+template
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE // clang-format off
+constexpr bool enable_dpx_reduction()
 {
- AccumT retval = prefix;
+ using T = decltype(::cuda::std::declval()[0]);
+ // TODO: use constexpr variable in C++14+
+ using Length = ::cuda::std::integral_constant()>;
+ return ((Length{} >= 9 && detail::are_same*/>()) || Length{} >= 10)
+ && detail::are_same()
+ && detail::is_one_of()
+ && detail::is_one_of*/>();
+}
+// clang-format on
-#pragma unroll
- for (int i = 0; i < LENGTH; ++i)
+// Considering compiler vectorization with 3-way comparison, the number of SASS instructions is
+// Standard: ceil((L - 3) / 2) + 1
+// replacing L with L/2 for SIMD
+// DPX: ceil((L/2 - 3) / 2) + 1 + 2 [for halfword comparison: PRMT, VIMNMX] + L % 2 [for last element]
+// finally, the last two comparison operations are vectorized in a 3-way reduction
+// ceil((L/2 - 3) / 2) + 3
+//
+// length | Standard | DPX
+//  2     |    1     | NA
+//  3     |    1     | NA
+//  4     |    2     | 3
+//  5     |    2     | 3
+//  6     |    3     | 3
+//  7     |    3     | 3
+//  8     |    4     | 4
+//  9     |    4     | 4
+// 10     |    5     | 4 // ***
+// 11     |    5     | 4 // ***
+// 12     |    6     | 5 // ***
+// 13     |    6     | 5 // ***
+// 14     |    7     | 5 // ***
+// 15     |    7     | 5 // ***
+// 16     |    8     | 6 // ***
+
+template
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
+ThreadReduceSequential(const Input& input, ReductionOp reduction_op)
+{
+ AccumT retval = input[0];
+# pragma unroll
+ for (int i = 1; i < detail::static_size(); ++i)
 {
 retval = reduction_op(retval, input[i]);
 }
-
 return retval;
 }
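+// Illustrative walk-through of the DPX path below, assuming Length == 11,
+// T == int16_t and a minimum reduction (example values only, not part of the
+// implementation):
+//   [a0 .. a10] -> copy to a local array, view a0..a9 as five packed 32-bit words
+//   ThreadReduceSequential over the words with DpxMin -> {min(a0,a2,..,a8) | min(a1,a3,..,a9)}
+//   combine the two 16-bit halves with the scalar reduction_op -> min(a0..a9)
+//   Length is odd, so fold in the trailing element a10 -> the final result
+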
+/// Specialization for DPX reduction
+template
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
+ThreadReduceDpx(const Input& input, ReductionOp reduction_op) -> ::cuda::std::__remove_cvref_t
+{
+ using T = ::cuda::std::__remove_cvref_t;
+ constexpr int length = detail::static_size();
+ T array[length];
+# pragma unroll
+ for (int i = 0; i < length; ++i)
+ {
+ array[i] = input[i];
+ }
+ using DpxReduceOp = cub_operator_to_dpx_t;
+ using SimdType = ::cuda::std::pair;
+ auto unsigned_input = reinterpret_cast(array);
+ auto simd_reduction = ThreadReduceSequential(unsigned_input, DpxReduceOp{});
+ auto simd_values = ::cuda::std::bit_cast(simd_reduction);
+ auto ret_value = reduction_op(simd_values.first, simd_values.second);
+ return (length % 2 == 0) ? ret_value : reduction_op(ret_value, input[length - 1]);
+}
+
+// DPX/Sequential dispatch
+template ()[0])>,
+ typename AccumT = ::cuda::std::__accumulator_t,
+ _CUB_TEMPLATE_REQUIRES(enable_dpx_reduction())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+ static_assert(detail::has_subscript::value, "Input must support the subscript operator[]");
+ static_assert(detail::has_size::value, "Input must have the size() method");
+ static_assert(detail::has_binary_call_operator::value,
+ "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+ NV_IF_TARGET(NV_PROVIDES_SM_90,
+ (return ThreadReduceDpx(input, reduction_op);),
+ (return ThreadReduceSequential(input, reduction_op);))
+}
+
+template ()[0])>,
+ typename AccumT = ::cuda::std::__accumulator_t,
+ _CUB_TEMPLATE_REQUIRES(!enable_dpx_reduction())>
+_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op)
+{
+ static_assert(detail::has_subscript::value, "Input must support the subscript operator[]");
+ static_assert(detail::has_size::value, "Input must have the size() method");
+ static_assert(detail::has_binary_call_operator::value,
+ "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
+ return ThreadReduceSequential(input, reduction_op);
+}
+
+#endif // !DOXYGEN_SHOULD_SKIP_THIS
+
 /**
- * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array,
- * seeded with the specified @p prefix. The aggregate is returned.
+ * @brief Reduction over statically-sized array-like types, seeded with the specified @p prefix.
 *
- * @tparam LENGTH
- * LengthT of input array
- *
- * @tparam T
- * [inferred] The data type to be reduced.
+ * @tparam Input + * [inferred] The data type to be reduced having member + * operator[](int i) and must be statically-sized (size() method or static array) * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * + * @tparam PrefixT + * [inferred] The prefix type + * * @param[in] input * Input array * @@ -105,101 +206,122 @@ ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Typecuda::std::__accumulator_t */ -template > -_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix) +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + typename ValueT = ::cuda::std::__remove_cvref_t()[0])>, +#endif // !DOXYGEN_SHOULD_SKIP_THIS + typename AccumT = ::cuda::std::__accumulator_t> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT +ThreadReduce(const Input& input, ReductionOp reduction_op, PrefixT prefix) { - return ThreadReduce(input, reduction_op, prefix, Int2Type()); + static_assert(detail::has_subscript::value, "Input must support the subscript operator[]"); + static_assert(detail::has_size::value, "Input must have the size() method"); + static_assert(detail::has_binary_call_operator::value, + "ReductionOp must have the binary call operator: operator(ValueT, ValueT)"); + constexpr int length = detail::static_size(); + // copy to a temporary array of type AccumT + AccumT array[length + 1]; + array[0] = prefix; +#pragma unroll + for (int i = 0; i < length; ++i) + { + array[i + 1] = input[i]; + } + return ThreadReduce(array, reduction_op); } +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + /** - * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array. - * The aggregate is returned. + * @remark The pointer interface adds little value and requires Length to be explicit. + * Prefer using the array-like interface * - * @tparam LENGTH - * LengthT of input array + * @brief Perform a sequential reduction over @p length elements of the @p input pointer. The aggregate is returned. * * @tparam T - * [inferred] The data type to be reduced. + * [inferred] The data type to be reduced * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * * @param[in] input - * Input array + * Input pointer * * @param[in] reduction_op * Binary reduction operator + * + * @return Aggregate of type cuda::std::__accumulator_t */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T* input, ReductionOp reduction_op) +template > +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const T* input, ReductionOp reduction_op) { - T prefix = input[0]; - return ThreadReduce(input + 1, reduction_op, prefix); + static_assert(Length > 0, "Length must be greater than 0"); + static_assert(detail::has_binary_call_operator::value, + "ReductionOp must have the binary call operator: operator(V1, V2)"); + using ArrayT = T[Length]; + auto array = reinterpret_cast(input); + return ThreadReduce(*array, reduction_op); } /** - * @brief Perform a sequential reduction over the statically-sized @p input array, - * seeded with the specified @p prefix. The aggregate is returned. + * @remark The pointer interface adds little value and requires Length to be explicit. + * Prefer using the array-like interface + * + * @brief Perform a sequential reduction over @p length elements of the @p input pointer, seeded with the specified @p + * prefix. The aggregate is returned. 
* - * @tparam LENGTH - * [inferred] LengthT of @p input array + * @tparam length + * Length of input pointer * * @tparam T - * [inferred] The data type to be reduced. + * [inferred] The data type to be reduced * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * + * @tparam PrefixT + * [inferred] The prefix type + * * @param[in] input - * Input array + * Input pointer * * @param[in] reduction_op * Binary reduction operator * * @param[in] prefix * Prefix to seed reduction with + * + * @return Aggregate of type cuda::std::__accumulator_t */ -template > -_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix) + typename AccumT = ::cuda::std::__accumulator_t, + _CUB_TEMPLATE_REQUIRES(Length > 0)> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT +ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix) { - return ThreadReduce(input, reduction_op, prefix, Int2Type()); + static_assert(detail::has_binary_call_operator::value, + "ReductionOp must have the binary call operator: operator(V1, V2)"); + auto array = reinterpret_cast(input); + return ThreadReduce(*array, reduction_op, prefix); } -/** - * @brief Serial reduction with the specified operator - * - * @tparam LENGTH - * [inferred] LengthT of @p input array - * - * @tparam T - * [inferred] The data type to be reduced. - * - * @tparam ReductionOp - * [inferred] Binary reduction operator type having member - * T operator()(const T &a, const T &b) - * - * @param[in] input - * Input array - * - * @param[in] reduction_op - * Binary reduction operator - */ -template -_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op) +template +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, ReductionOp, PrefixT prefix) { - return ThreadReduce((T*) input, reduction_op); + return prefix; } +#endif // !DOXYGEN_SHOULD_SKIP_THIS + } // namespace internal CUB_NAMESPACE_END diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index b9ef4f1f6c..5f8780620f 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -43,7 +43,7 @@ # pragma system_header #endif // no system header -#include +#include // IWYU pragma: export #include #include @@ -136,13 +136,21 @@ static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); # define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0) # endif +namespace detail +{ +// The maximum amount of static shared memory available per thread block +// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB +static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024; +} // namespace detail + template struct RegBoundScaling { enum { ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), + BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, + ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; @@ -153,7 +161,8 @@ struct MemBoundScaling { ITEMS_PER_THREAD = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), + BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, + 
((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh index ef0c178c56..cae7565ec7 100644 --- a/cub/cub/util_compiler.cuh +++ b/cub/cub/util_compiler.cuh @@ -44,45 +44,67 @@ #endif // no system header // enumerate host compilers we know about +//! deprecated [Since 2.7] #define CUB_HOST_COMPILER_UNKNOWN 0 -#define CUB_HOST_COMPILER_MSVC 1 -#define CUB_HOST_COMPILER_GCC 2 -#define CUB_HOST_COMPILER_CLANG 3 +//! deprecated [Since 2.7] +#define CUB_HOST_COMPILER_MSVC 1 +//! deprecated [Since 2.7] +#define CUB_HOST_COMPILER_GCC 2 +//! deprecated [Since 2.7] +#define CUB_HOST_COMPILER_CLANG 3 // enumerate device compilers we know about +//! deprecated [Since 2.7] #define CUB_DEVICE_COMPILER_UNKNOWN 0 -#define CUB_DEVICE_COMPILER_MSVC 1 -#define CUB_DEVICE_COMPILER_GCC 2 -#define CUB_DEVICE_COMPILER_NVCC 3 -#define CUB_DEVICE_COMPILER_CLANG 4 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_MSVC 1 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_GCC 2 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_NVCC 3 +//! deprecated [Since 2.7] +#define CUB_DEVICE_COMPILER_CLANG 4 // figure out which host compiler we're using #if defined(_CCCL_COMPILER_MSVC) -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC -# define CUB_MSVC_VERSION _MSC_VER -# define CUB_MSVC_VERSION_FULL _MSC_FULL_VER +//! deprecated [Since 2.7] +# define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC +//! deprecated [Since 2.7] +# define CUB_MSVC_VERSION _CCCL_MSVC_VERSION +//! deprecated [Since 2.7] +# define CUB_MSVC_VERSION_FULL _CCCL_MSVC_VERSION_FULL #elif defined(_CCCL_COMPILER_CLANG) +//! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG -# define CUB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) +//! deprecated [Since 2.7] +# define CUB_CLANG_VERSION _CCCL_CLANG_VERSION #elif defined(_CCCL_COMPILER_GCC) +//! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC -# define CUB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +//! deprecated [Since 2.7] +# define CUB_GCC_VERSION _CCCL_GCC_VERSION #endif // figure out which device compiler we're using #if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_CUDA_COMPILER_NVHPC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC #elif defined(_CCCL_COMPILER_MSVC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC #elif defined(_CCCL_COMPILER_GCC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC #elif defined(_CCCL_COMPILER_CLANG) // CUDA-capable clang should behave similar to NVCC. # if defined(_CCCL_CUDA_COMPILER_NVCC) +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC # else +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG # endif #else +//! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN #endif diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index d5beca2f6b..e34e253d3e 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Detect the version of the C++ standard used by the compiler. +//! @file +//! Detect the version of the C++ standard used by the compiler. 
#pragma once @@ -39,7 +40,7 @@ # pragma system_header #endif // no system header -#include +#include // IWYU pragma: export #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -100,14 +101,14 @@ # ifndef CUB_IGNORE_DEPRECATED_COMPILER // Compiler checks: -# if defined(_CCCL_COMPILER_GCC) && CUB_GCC_VERSION < 50000 +# if defined(_CCCL_COMPILER_GCC) && _CCCL_GCC_VERSION < 50000 CUB_COMPILER_DEPRECATION(GCC 5.0); -# elif defined(_CCCL_COMPILER_CLANG) && CUB_CLANG_VERSION < 70000 +# elif defined(_CCCL_COMPILER_CLANG) && _CCCL_CLANG_VERSION < 70000 CUB_COMPILER_DEPRECATION(Clang 7.0); -# elif defined(_CCCL_COMPILER_MSVC) && CUB_MSVC_VERSION < 1910 +# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1910 // <2017. Hard upgrade message: CUB_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); -# elif defined(_CCCL_COMPILER_MSVC) && CUB_MSVC_VERSION < 1920 +# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1920 // >=2017, <2019. Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 714aa014ce..7ea6dc3847 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -47,13 +47,13 @@ # pragma system_header #endif // no system header -#include +#include // IWYU pragma: export #include #include // for backward compatibility #include -#include +#include // IWYU pragma: export #include #include diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index 162641d348..37d4ed05f3 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -42,8 +42,8 @@ # pragma system_header #endif // no system header -#include -#include +#include // IWYU pragma: export +#include // IWYU pragma: export #include @@ -112,12 +112,12 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wattributes") # if !defined(_CCCL_CUDA_COMPILER_NVHPC) _CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) # endif // !_CCCL_CUDA_COMPILER_NVHPC -# if defined(_CCCL_COMPILER_ICC) || defined(_CCCL_COMPILER_ICC_LLVM) +# if defined(_CCCL_COMPILER_ICC) # pragma nv_diag_suppress 1407 // the "__visibility__" attribute can only appear on functions and // variables with external linkage' # pragma warning(disable : 1890) // the "__visibility__" attribute can only appear on functions and // variables with external linkage' -# endif // _CCCL_COMPILER_ICC || _CCCL_COMPILER_ICC_LLVM +# endif // _CCCL_COMPILER_ICC #endif // !CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION CUB_NAMESPACE_END diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index e23f6e6578..8ae4e2d05b 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -44,6 +44,7 @@ #endif // no system header #include +#include #include #include diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index 6a0d6b9a94..d2e5541c09 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -42,6 +42,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -67,10 +68,6 @@ struct vsmem_t void* gmem_ptr; }; -// The maximum amount of static shared memory available per thread block -// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB -static constexpr std::size_t max_smem_per_block = 48 * 1024; - /** * @brief Class template that helps to prevent exceeding the available shared memory per thread block. 
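// A simplified sketch of the decision vsmem_t supports (assumptions: only the
// 48 KB static shared-memory budget matters, and the fallback is a per-block
// slice of global memory sized like the kernel's TempStorage):
#include <cstddef>

template <std::size_t StorageBytes>
struct needs_vsmem_fallback
{
  static constexpr std::size_t max_static_smem = 48 * 1024;
  // true: allocate StorageBytes * gridDim.x of global memory and give each
  // block its own slice; false: a plain __shared__ TempStorage is enough.
  static constexpr bool value = StorageBytes > max_static_smem;
};

static_assert(!needs_vsmem_fallback<32 * 1024>::value, "32 KB still fits in static smem");
static_assert(needs_vsmem_fallback<64 * 1024>::value, "64 KB must spill to the gmem fallback");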
* diff --git a/cub/cub/version.cuh b/cub/cub/version.cuh index e485eb3cd0..95755133e3 100644 --- a/cub/cub/version.cuh +++ b/cub/cub/version.cuh @@ -58,7 +58,7 @@ * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ -#define CUB_VERSION 200600 // macro expansion with ## requires this to be a single value +#define CUB_VERSION 200700 // macro expansion with ## requires this to be a single value /*! \def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 41b23e6dff..fdd4083c37 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -127,7 +127,7 @@ struct WarpReduceShfl { enum { - /// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per + /// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per /// exchange IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) }; diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index bfcef99656..ac5c700b95 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Operations for reading linear tiles of data into the CUDA warp. +//! @file +//! Operations for reading linear tiles of data into the CUDA warp. #pragma once diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index e123330ba1..bb99bc5965 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Operations for writing linear segments of data from the CUDA warp +//! @file +//! Operations for writing linear segments of data from the CUDA warp #pragma once diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 48a0142801..3ec8c94eef 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -261,6 +261,11 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() + # enable lambdas for all API examples + if ("${test_target}" MATCHES "test.[A-Za-z0-9_]+_api") + target_compile_options(${test_target} PRIVATE $<$:--extended-lambda>) + endif() + target_link_libraries(${test_target} PRIVATE ${cub_target} ${config_c2h_target} diff --git a/cub/test/c2h/checked_allocator.cuh b/cub/test/c2h/checked_allocator.cuh index 46f2601b1c..11d0d8ccbc 100644 --- a/cub/test/c2h/checked_allocator.cuh +++ b/cub/test/c2h/checked_allocator.cuh @@ -35,24 +35,62 @@ #include #include +#include +#include #include -// #define DEBUG_CHECKED_ALLOC_FAILURE - -#ifdef DEBUG_CHECKED_ALLOC_FAILURE -# include -#endif - namespace c2h { namespace detail { +struct memory_info +{ + std::size_t free{}; + std::size_t total{}; + bool override{false}; +}; + +// If the environment variable CCCL_DEVICE_MEMORY_LIMIT is set, the total device memory +// will be limited to this number of bytes. +inline std::size_t get_device_memory_limit() +{ + static const char* override_str = std::getenv("CCCL_DEVICE_MEMORY_LIMIT"); + static std::size_t result = override_str ? 
static_cast(std::atoll(override_str)) : 0; + return result; +} + +inline bool get_debug_checked_allocs() +{ + static const char* debug_checked_allocs = std::getenv("CCCL_DEBUG_CHECKED_ALLOC_FAILURES"); + static bool result = debug_checked_allocs && (std::atoi(debug_checked_allocs) != 0); + return result; +} + +inline cudaError_t get_device_memory(memory_info& info) +{ + static std::size_t device_memory_limit = get_device_memory_limit(); + + cudaError_t status = cudaMemGetInfo(&info.free, &info.total); + if (status != cudaSuccess) + { + return status; + } + + if (device_memory_limit > 0) + { + info.free = (std::max)(std::size_t{0}, static_cast(info.free - (info.total - device_memory_limit))); + info.total = device_memory_limit; + info.override = true; + } + + return cudaSuccess; +} + inline cudaError_t check_free_device_memory(std::size_t bytes) { - std::size_t free_bytes{}; - std::size_t total_bytes{}; - cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes); + memory_info info; + cudaError_t status = get_device_memory(info); if (status != cudaSuccess) { return status; @@ -60,20 +98,31 @@ inline cudaError_t check_free_device_memory(std::size_t bytes) // Avoid allocating all available memory: constexpr std::size_t padding = 16 * 1024 * 1024; // 16 MiB - if (free_bytes < (bytes + padding)) + if (info.free < (bytes + padding)) { -#ifdef DEBUG_CHECKED_ALLOC_FAILURE - const double total_GiB = static_cast(total_bytes) / (1024 * 1024 * 1024); - const double free_GiB = static_cast(free_bytes) / (1024 * 1024 * 1024); - const double requested_GiB = static_cast(bytes) / (1024 * 1024 * 1024); - const double padded_GiB = static_cast(bytes + padding) / (1024 * 1024 * 1024); - - std::cerr - << "Total device mem: " << total_GiB << " GiB\n" // - << "Free device mem: " << free_GiB << " GiB\n" // - << "Requested device mem: " << requested_GiB << " GiB\n" // - << "Padded device mem: " << padded_GiB << " GiB\n"; -#endif + if (get_debug_checked_allocs()) + { + const double total_GiB = static_cast(info.total) / (1024 * 1024 * 1024); + const double free_GiB = static_cast(info.free) / (1024 * 1024 * 1024); + const double requested_GiB = static_cast(bytes) / (1024 * 1024 * 1024); + const double padded_GiB = static_cast(bytes + padding) / (1024 * 1024 * 1024); + + std::cerr << "Device memory allocation failed due to insufficient free device memory.\n"; + + if (info.override) + { + std::cerr + << "Available device memory has been limited (env var CCCL_DEVICE_MEMORY_LIMIT=" << get_device_memory_limit() + << ").\n"; + } + + std::cerr + << "Total device mem: " << total_GiB << " GiB\n" // + << "Free device mem: " << free_GiB << " GiB\n" // + << "Requested device mem: " << requested_GiB << " GiB\n" // + << "Padded device mem: " << padded_GiB << " GiB\n"; + } + return cudaErrorMemoryAllocation; } diff --git a/cub/test/c2h/utility.cuh b/cub/test/c2h/utility.cuh index 2f4ac412ca..434f3cf51f 100644 --- a/cub/test/c2h/utility.cuh +++ b/cub/test/c2h/utility.cuh @@ -38,19 +38,6 @@ namespace c2h { -/** - * Return a value of type `T0` with the same bitwise representation of `in`. - * Types `To` and `From` must be the same size. 
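// A condensed, standalone sketch of the guard above. It mirrors how
// get_device_memory applies the CCCL_DEVICE_MEMORY_LIMIT clamp and how
// check_free_device_memory keeps a 16 MiB cushion, but takes all numbers as
// parameters instead of querying CUDA.
#include <cstddef>

constexpr std::size_t alloc_padding_sketch = 16 * 1024 * 1024; // keep 16 MiB free

constexpr bool allocation_would_fit(std::size_t free_bytes, std::size_t total_bytes,
                                    std::size_t limit, std::size_t request)
{
  // an env-var limit shrinks the reported free memory by the hidden portion
  const std::size_t effective_free =
    (limit > 0 && limit < total_bytes) ? free_bytes - (total_bytes - limit) : free_bytes;
  return effective_free >= request + alloc_padding_sketch;
}

static_assert(allocation_would_fit(64 << 20, 128 << 20, 0, 32 << 20),
              "32 MiB + 16 MiB cushion fits into 64 MiB of free memory");
static_assert(!allocation_would_fit(64 << 20, 128 << 20, 96 << 20, 32 << 20),
              "a 96 MiB limit hides 32 MiB of the free memory, so the request no longer fits");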
- */ -template -__host__ __device__ To bit_cast(const From& in) -{ - static_assert(sizeof(To) == sizeof(From), "Types must be same size."); - To out; - memcpy(&out, &in, sizeof(To)); - return out; -} - // TODO(bgruber): duplicated version of thrust/testing/unittest/system.h inline std::string demangle(const char* name) { diff --git a/cub/test/catch2_main.cuh b/cub/test/catch2_main.cuh index fc08aa13eb..1d42355ce7 100644 --- a/cub/test/catch2_main.cuh +++ b/cub/test/catch2_main.cuh @@ -29,10 +29,10 @@ #include -//! @file This file includes a custom Catch2 main function. When CMake is configured to build -//! each test as a separate executable, this header is included into each test. On the other -//! hand, when all the tests are compiled into a single executable, this header is excluded -//! from the tests and included into catch2_runner.cpp +//! @file +//! This file includes a custom Catch2 main function. When CMake is configured to build each test as a separate +//! executable, this header is included into each test. On the other hand, when all the tests are compiled into a single +//! executable, this header is excluded from the tests and included into catch2_runner.cpp #ifdef CUB_CONFIG_MAIN # define CATCH_CONFIG_RUNNER diff --git a/cub/test/catch2_radix_sort_helper.cuh b/cub/test/catch2_radix_sort_helper.cuh index 758253203b..61b02fc6f1 100644 --- a/cub/test/catch2_radix_sort_helper.cuh +++ b/cub/test/catch2_radix_sort_helper.cuh @@ -39,6 +39,8 @@ #include #include +#include + #include #include #include @@ -199,7 +201,7 @@ c2h::host_vector get_striped_keys(const c2h::host_vector& h_keys, in for (std::size_t i = 0; i < h_keys.size(); i++) { - bit_ordered_t key = c2h::bit_cast(h_keys[i]); + bit_ordered_t key = ::cuda::std::bit_cast(h_keys[i]); _CCCL_IF_CONSTEXPR (traits_t::CATEGORY == cub::FLOATING_POINT) { diff --git a/cub/test/catch2_runner.cpp b/cub/test/catch2_runner.cpp index 53a19f7b6a..73f3f70d8a 100644 --- a/cub/test/catch2_runner.cpp +++ b/cub/test/catch2_runner.cpp @@ -25,8 +25,9 @@ * ******************************************************************************/ -//! @file This file includes a custom Catch2 main function when CMake is configured to build -//! all tests into a single executable. +//! @file +//! This file includes a custom Catch2 main function when CMake is configured to build all tests into a single +//! executable. #define CUB_CONFIG_MAIN #define CUB_EXCLUDE_CATCH2_HELPER_IMPL diff --git a/cub/test/catch2_runner_helper.cu b/cub/test/catch2_runner_helper.cu index d16e09f742..628f9525f7 100644 --- a/cub/test/catch2_runner_helper.cu +++ b/cub/test/catch2_runner_helper.cu @@ -25,9 +25,9 @@ * ******************************************************************************/ -//! @file This file includes CUDA-specific utilities for custom Catch2 main function when CMake is -//! configured to build all tests into a single executable. In this case, we have to have -//! a CUDA target in the final Catch2 executable, otherwise CMake confuses linker options and -//! MSVC/RDC build fails. +//! @file +//! This file includes CUDA-specific utilities for custom Catch2 main function when CMake is configured to build all +//! tests into a single executable. In this case, we have to have a CUDA target in the final Catch2 executable, +//! otherwise CMake confuses linker options and MSVC/RDC build fails. 
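// The c2h::bit_cast helper removed above was a memcpy-based stand-in for what
// the tests now take from libcu++ directly. A minimal usage sketch:
#include <cuda/std/bit>
#include <cuda/std/cstdint>

__host__ __device__ inline ::cuda::std::uint32_t float_to_bits(float f)
{
  // source and destination types must have exactly the same size
  return ::cuda::std::bit_cast<::cuda::std::uint32_t>(f);
}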
#include "catch2_runner_helper.inl" diff --git a/cub/test/catch2_runner_helper.inl b/cub/test/catch2_runner_helper.inl index 3971760800..f8a2bfa2ab 100644 --- a/cub/test/catch2_runner_helper.inl +++ b/cub/test/catch2_runner_helper.inl @@ -27,10 +27,10 @@ #pragma once -//! @file This file includes implementation of CUDA-specific utilities for custom Catch2 main -//! When CMake is configured to include all the tests into a single executable, this file -//! is only included into catch2_runner_helper.cu. When CMake is configured to compile -//! each test as a separate binary, this file is included into each test. +//! @file +//! This file includes implementation of CUDA-specific utilities for custom Catch2 main When CMake is configured to +//! include all the tests into a single executable, this file is only included into catch2_runner_helper.cu. When CMake +//! is configured to compile each test as a separate binary, this file is included into each test. #include diff --git a/cub/test/catch2_test_block_load.cu b/cub/test/catch2_test_block_load.cu index 39bccc50c5..43fd75698f 100644 --- a/cub/test/catch2_test_block_load.cu +++ b/cub/test/catch2_test_block_load.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include "catch2_test_helper.h" @@ -113,7 +114,7 @@ void block_load(InputIteratorT input, OutputIteratorT output, int num_items) using input_t = cub::detail::value_t; using block_load_t = cub::BlockLoad; using storage_t = typename block_load_t::TempStorage; - constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48; + constexpr bool sufficient_resources = sizeof(storage_t) <= cub::detail::max_smem_per_block; kernel <<<1, ThreadsInBlock>>>(std::integral_constant{}, input, output, num_items); diff --git a/cub/test/catch2_test_block_store.cu b/cub/test/catch2_test_block_store.cu index f157a28ea0..566dd2e828 100644 --- a/cub/test/catch2_test_block_store.cu +++ b/cub/test/catch2_test_block_store.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include "catch2_test_helper.h" @@ -114,7 +115,7 @@ void block_store(InputIteratorT input, OutputIteratorT output, int num_items) using input_t = cub::detail::value_t; using block_store_t = cub::BlockStore; using storage_t = typename block_store_t::TempStorage; - constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48; + constexpr bool sufficient_resources = sizeof(storage_t) <= cub::detail::max_smem_per_block; kernel <<<1, ThreadsInBlock>>>(std::integral_constant{}, input, output, num_items); diff --git a/cub/test/catch2_test_device_histogram.cu b/cub/test/catch2_test_device_histogram.cu index 5d43ebbc05..c0e2ee1fa9 100644 --- a/cub/test/catch2_test_device_histogram.cu +++ b/cub/test/catch2_test_device_histogram.cu @@ -29,10 +29,9 @@ #include #include -#include -#include -#include +#include #include +#include #include #include @@ -213,7 +212,7 @@ struct bit_and_anything _CCCL_HOST_DEVICE auto operator()(const T& a, const T& b) const -> T { using U = typename cub::Traits::UnsignedBits; - return c2h::bit_cast(static_cast(c2h::bit_cast(a) & c2h::bit_cast(b))); + return ::cuda::std::bit_cast(static_cast(::cuda::std::bit_cast(a) & ::cuda::std::bit_cast(b))); } }; diff --git a/cub/test/catch2_test_device_radix_sort_keys.cu b/cub/test/catch2_test_device_radix_sort_keys.cu index 961361622d..24d60033e3 100644 --- a/cub/test/catch2_test_device_radix_sort_keys.cu +++ b/cub/test/catch2_test_device_radix_sort_keys.cu @@ -192,8 +192,8 @@ CUB_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sor using 
bits_t = typename cub::Traits::UnsignedBits; constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT; - const key_t positive_zero = c2h::bit_cast(bits_t(0)); - const key_t negative_zero = c2h::bit_cast(bits_t(1) << (num_bits - 1)); + const key_t positive_zero = ::cuda::std::bit_cast(bits_t(0)); + const key_t negative_zero = ::cuda::std::bit_cast(bits_t(1) << (num_bits - 1)); constexpr std::size_t max_num_items = 1 << 18; const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items))); diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index 1e9e08c911..bfd7c3e8a2 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -24,7 +24,6 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ - #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first @@ -48,7 +47,7 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Max, device_max); DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max); // %PARAM% TEST_LAUNCH lid 0:1:2 -// %PARAM% TEST_TYPES types 0:1:2:3 +// %PARAM% TEST_TYPES types 0:1:2:3:4 // List of types to test using custom_t = @@ -72,9 +71,13 @@ type_pair #endif #if TEST_BF_T , type_pair // testing bf16 -#endif + >; +#endif // clang-format on +#elif TEST_TYPES == 4 +// DPX SIMD instructions +using full_type_list = c2h::type_list, type_pair>; #endif /** @@ -124,6 +127,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f } auto d_in_it = thrust::raw_pointer_cast(in_items.data()); +#if TEST_TYPES != 4 SECTION("reduce") { using op_t = cub::Sum; @@ -132,7 +136,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = static_cast(compute_single_problem_reference(in_items, reduction_op, accum_t{})); @@ -145,6 +149,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Verify result REQUIRE(expected_result == out_result[0]); } +#endif // TEST_TYPES != 4 // Skip DeviceReduce::Sum tests for extended floating-point types because of unbounded epsilon due // to pseudo associativity of the addition operation over floating point numbers @@ -152,7 +157,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data output_t expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); @@ -197,6 +202,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f REQUIRE(expected_result == out_result[0]); } +#if TEST_TYPES != 4 SECTION("argmax") { // Prepare verification data @@ -233,4 +239,5 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f REQUIRE(expected_result[0] == gpu_value); REQUIRE((expected_result - host_items.cbegin()) == gpu_result.key); } +#endif } diff --git a/cub/test/catch2_test_device_reduce_by_key.cu b/cub/test/catch2_test_device_reduce_by_key.cu index 39f31d5e78..88c305fd36 100644 --- a/cub/test/catch2_test_device_reduce_by_key.cu +++ 
b/cub/test/catch2_test_device_reduce_by_key.cu @@ -116,7 +116,7 @@ CUB_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis auto reduction_op = unwrap_op(reference_extended_fp(d_values_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_values, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_by_key_iterators.cu b/cub/test/catch2_test_device_reduce_by_key_iterators.cu index 3637813b5f..14b7fcde9f 100644 --- a/cub/test/catch2_test_device_reduce_by_key_iterators.cu +++ b/cub/test/catch2_test_device_reduce_by_key_iterators.cu @@ -90,7 +90,7 @@ CUB_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]" using op_t = cub::Sum; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(value_it, segment_offsets, op_t{}, accum_t{}, expected_result.begin()); c2h::host_vector expected_keys = compute_unique_keys_reference(segment_keys); diff --git a/cub/test/catch2_test_device_reduce_iterators.cu b/cub/test/catch2_test_device_reduce_iterators.cu index 7c7f74ec63..ab1dabbbb1 100644 --- a/cub/test/catch2_test_device_reduce_iterators.cu +++ b/cub/test/catch2_test_device_reduce_iterators.cu @@ -104,7 +104,7 @@ CUB_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{}); // Run test diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu index 49c9aac39c..736e217b0e 100644 --- a/cub/test/catch2_test_device_scan.cu +++ b/cub/test/catch2_test_device_scan.cu @@ -127,7 +127,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -155,7 +155,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -184,7 +184,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector host_items(in_items); @@ -213,7 +213,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("inclusive scan with init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -248,7 +248,7 @@ CUB_TEST("Device scan works with all device interfaces", 
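// ::cuda::std::__accumulator_t is libcu++-internal, but the substitution made
// throughout these tests boils down to the following idea (a sketch, not the
// real trait): the accumulator type is the decayed result of applying the
// operator to an init value and an input value.
#include <functional>
#include <type_traits>
#include <utility>

template <typename Op, typename Init, typename Input>
using accum_sketch_t = std::decay_t<decltype(std::declval<Op>()(std::declval<Init>(), std::declval<Input>()))>;

static_assert(std::is_same<accum_sketch_t<std::plus<>, long, int>, long>::value,
              "int inputs accumulated against a long init widen to long");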
"[scan][device]", full_ SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); @@ -281,7 +281,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_ SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Scan operator auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index d3644e3387..dc5b7804e8 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -61,7 +61,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -75,7 +75,7 @@ template ; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -101,7 +101,7 @@ void compute_exclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; if (num_items > 0) @@ -152,7 +152,7 @@ void compute_inclusive_scan_by_key_reference( std::size_t num_items) { using value_t = cub::detail::value_t; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::value_t; for (std::size_t i = 0; i < num_items;) diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu index 576d0d3f74..a07397cc36 100644 --- a/cub/test/catch2_test_device_scan_iterators.cu +++ b/cub/test/catch2_test_device_scan_iterators.cu @@ -84,7 +84,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -102,7 +102,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -120,7 +120,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("inclusive scan") { using op_t = cub::Min; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -139,7 +139,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_items); @@ -157,7 +157,7 @@ CUB_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis SECTION("exclusive scan with future-init value") { using op_t = cub::Sum; - 
using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data accum_t init_value{}; diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9d00d89e14..0c0854e21e 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -35,33 +35,12 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" -// TODO(elstehle) replace with DeviceScan interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceScan algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_scan_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream = 0) -{ - using init_value_t = cub::detail::InputValue; - init_value_t init_value_wrapper{init_value}; - - return cub::DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value_wrapper, num_items, stream); -} - -DECLARE_LAUNCH_WRAPPER(dispatch_scan_wrapper, dispatch_exclusive_scan); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScan, device_exclusive_scan); // %PARAM% TEST_LAUNCH lid 0:1:2 -// TODO(elstehle) replace with actual offset types, once https://github.com/NVIDIA/cccl/issues/50 is addresed // List of offset types to be used for testing large number of items -using offset_types = c2h::type_list; +using offset_types = c2h::type_list; template struct expected_sum_op @@ -106,12 +85,12 @@ try offset_t num_items_max = static_cast(num_items_max_ull); offset_t num_items_min = num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; - // TODO(elstehle) remove single-item size, once https://github.com/NVIDIA/cccl/issues/50 is addresed - offset_t num_items = - GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), static_cast(1)}), - take(2, random(num_items_min, num_items_max))); + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); - // Prepare input + // Prepare input (generate a series of: 0, 1, 2, ..., , 0, 1, 2, ..., , 0, 1, ...) 
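// The input below is never materialized: a counting_iterator feeds a modulo
// functor, so dereferencing yields the repeating ramp lazily. A standalone
// sketch (the mod_op shape is assumed to match the test's helper):
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

struct mod_op_sketch
{
  long long mod;
  __host__ __device__ long long operator()(long long i) const { return i % mod; }
};

inline auto make_repeating_ramp(long long segment_size)
{
  auto index_it = thrust::make_counting_iterator(0LL);
  // for segment_size == 1000: *(it + 2500) == 500
  return thrust::make_transform_iterator(index_it, mod_op_sketch{segment_size});
}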
constexpr index_t segment_size = 1000; auto index_it = thrust::make_counting_iterator(index_t{}); auto items_it = thrust::make_transform_iterator(index_it, mod_op{segment_size}); @@ -120,8 +99,12 @@ try c2h::device_vector d_items_out(num_items); auto d_items_out_it = thrust::raw_pointer_cast(d_items_out.data()); + c2h::device_vector d_initial_value(1); + d_initial_value[0] = item_t{}; + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + // Run test - dispatch_exclusive_scan(items_it, d_items_out_it, op_t{}, item_t{}, num_items); + device_exclusive_scan(items_it, d_items_out_it, op_t{}, future_init_value, num_items); // Ensure that we created the correct output auto expected_out_it = diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 770b85b019..5559e7e2e8 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -121,7 +121,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_items, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); @@ -142,7 +142,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; // Prepare verification data c2h::host_vector expected_result(num_segments); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index 8ab495ddc5..a81559b91e 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -93,7 +93,7 @@ CUB_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_it, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu new file mode 100644 index 0000000000..50f253ef5c --- /dev/null +++ b/cub/test/catch2_test_device_transform.cu @@ -0,0 +1,556 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
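// A sketch of the cub::FutureValue wrapper used in the test above: the init
// value lives in device memory and is read when the scan kernel runs, rather
// than being captured on the host at dispatch time.
#include <cub/util_type.cuh>
#include <thrust/device_vector.h>

inline cub::FutureValue<int> future_init_of(thrust::device_vector<int>& d_init)
{
  return cub::FutureValue<int>(thrust::raw_pointer_cast(d_init.data()));
}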
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "c2h/custom_type.cuh" +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" +#include "test/test_util_vec.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +using cub::detail::transform::Algorithm; + +template +struct policy_hub_for_alg +{ + struct max_policy : cub::ChainedPolicy<300, max_policy, max_policy> + { + static constexpr int min_bif = 64 * 1024; + static constexpr Algorithm algorithm = Alg; + using algo_policy = + ::cuda::std::_If>; + }; +}; + +template +CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + Offset num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) +{ + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + constexpr bool RequiresStableAddress = false; + return cub::detail::transform::dispatch_t, + RandomAccessIteratorOut, + TransformOp, + policy_hub_for_alg>{} + .dispatch(inputs, output, num_items, transform_op, stream); +} + +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::TransformStableArgumentAddresses, transform_many_stable); +DECLARE_TMPL_LAUNCH_WRAPPER(transform_many_with_alg_entry_point, + transform_many_with_alg, + ESCAPE_LIST(Algorithm Alg, typename Offset), + ESCAPE_LIST(Alg, Offset)); + +using algorithms = + c2h::enum_type_list; + +using offset_types = c2h::type_list; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP \ + if (alg == Algorithm::ublkcp && ptx_version < 900) \ + { \ + return; \ + } +#else // _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +#define FILTER_UNSUPPORTED_ALGS \ + int ptx_version = 0; \ + REQUIRE(cub::PtxVersion(ptx_version) == cudaSuccess); \ + _CCCL_DIAG_PUSH \ + _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */ \ + FILTER_UBLKCP \ + _CCCL_DIAG_POP + +CUB_TEST("DeviceTransform::Transform BabelStream add", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + const int num_items = GENERATE(0, 1, 15, 16, 17, 127, 128, 129, 4095, 4096, 4097); // edge cases around 16 and 128 + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg, num_items); + + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::gen(CUB_SEED(1), a); + c2h::gen(CUB_SEED(1), b); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +template +struct alignas(Alignment) overaligned_addable_t +{ + int value; + + overaligned_addable_t() = default; + + _CCCL_HOST_DEVICE overaligned_addable_t(int val) + : value{val} + {} + + _CCCL_HOST_DEVICE static 
void check(const overaligned_addable_t& obj)
+  {
+    if (reinterpret_cast<::cuda::std::uintptr_t>(&obj) % Alignment != 0)
+    {
+      printf("Error: object not aligned to %d: %p\n", Alignment, &obj);
+      ::cuda::std::terminate();
+    }
+  }
+
+  _CCCL_HOST_DEVICE friend auto operator==(const overaligned_addable_t& a, const overaligned_addable_t& b) -> bool
+  {
+    check(a);
+    check(b);
+    return a.value == b.value;
+  }
+
+  _CCCL_HOST_DEVICE friend auto
+  operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t
+  {
+    check(a);
+    check(b);
+    return overaligned_addable_t{a.value + b.value};
+  }
+
+  _CCCL_HOST friend auto operator<<(std::ostream& os, const overaligned_addable_t& obj) -> std::ostream&
+  {
+    check(obj);
+    return os << "over{" << obj.value << "}";
+  }
+};
+
+using overaligned_types =
+  c2h::type_list<overaligned_addable_t<32>
+#ifndef _CCCL_COMPILER_MSVC // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned
+                 ,
+                 overaligned_addable_t<256>
+#endif // _CCCL_COMPILER_MSVC
+                 >;
+
+// test with types exceeding the memcpy_async and bulk copy alignments (16 and 128 bytes respectively)
+CUB_TEST("DeviceTransform::Transform overaligned type", "[device][device_transform]", overaligned_types)
+{
+  using type = c2h::get<0, TestType>;
+  CAPTURE(c2h::demangle(typeid(type).name()));
+
+  const int num_items = GENERATE(0, 1, 100, 1000);
+  c2h::device_vector<type> a(num_items, 3); // put some integers at the front, so SMEM has to handle different alignments
+  c2h::device_vector<type> b(num_items, 4);
+
+  c2h::device_vector<type> result(num_items);
+  // we need raw pointers here to halve the conversion sequence from device_reference -> int -> type when calling
+  // plus(...), which is too long to compile
+  transform_many(::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())),
+                 result.begin(),
+                 num_items,
+                 ::cuda::std::plus{});
+
+  REQUIRE(result == c2h::device_vector<type>(num_items, 7));
+}
+
+CUB_TEST("DeviceTransform::Transform huge type", "[device][device_transform]")
+{
+  using huge_t = c2h::custom_type_t::type>;
+  static_assert(alignof(huge_t) == 8, "Need a large type with alignment < 16");
+  CAPTURE(c2h::demangle(typeid(huge_t).name()));
+
+  const int num_items = GENERATE(0, 1, 100, 1000);
+  c2h::device_vector<huge_t> a(num_items);
+  c2h::device_vector<huge_t> b(num_items);
+  c2h::gen(CUB_SEED(1), a);
+  c2h::gen(CUB_SEED(1), b);
+
+  c2h::device_vector<huge_t> result(num_items);
+  transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{});
+
+  c2h::host_vector<huge_t> a_h = a;
+  c2h::host_vector<huge_t> b_h = b;
+  c2h::host_vector<huge_t> reference_h(num_items);
+  std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{});
+  REQUIRE(result == reference_h);
+}
+
+struct times_seven
+{
+  _CCCL_HOST_DEVICE auto operator()(unsigned char v) const -> char
+  {
+    return static_cast<char>(v * 7);
+  }
+};
+
+CUB_TEST("DeviceTransform::Transform with large input", "[device][device_transform]", algorithms)
+try
+{
+  using type     = unsigned char;
+  using offset_t = cuda::std::int64_t;
+  constexpr auto alg = c2h::get<0, TestType>::value;
+  FILTER_UNSUPPORTED_ALGS
+  CAPTURE(alg);
+
+  constexpr offset_t num_items = (offset_t{1} << 32) + 123456; // a few thread blocks beyond 4GiB
+  c2h::device_vector<type> input(num_items);
+  c2h::gen(CUB_SEED(1), input);
+
+  c2h::device_vector<type> result(num_items);
+  transform_many_with_alg<alg, offset_t>(
+    ::cuda::std::make_tuple(input.begin()), result.begin(), num_items, times_seven{});
+
+  // compute reference and verify
+  c2h::host_vector
input_h = input; + c2h::host_vector reference_h(num_items); + std::transform(input_h.begin(), input_h.end(), reference_h.begin(), times_seven{}); + REQUIRE((reference_h == result)); +} +catch (const std::bad_alloc&) +{ + // allocation failure is not a test failure, so we can run tests on smaller GPUs +} + +template +struct nstream_kernel +{ + static constexpr T scalar = 42; + + _CCCL_HOST_DEVICE T operator()(const T& ai, const T& bi, const T& ci) const + { + return ai + bi + scalar * ci; + } +}; + +// overwrites one input stream +CUB_TEST("DeviceTransform::Transform BabelStream nstream", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg); + + const int num_items = GENERATE(0, 1, 100, 1000, 10000); + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::device_vector c(num_items); + c2h::gen(CUB_SEED(1), a, type{10}, type{100}); + c2h::gen(CUB_SEED(1), b, type{10}, type{100}); + c2h::gen(CUB_SEED(1), c, type{10}, type{100}); + + // copy to host before changing + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector c_h = c; + + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), num_items, nstream_kernel{}); + + // compute reference and verify + auto z = thrust::make_zip_iterator(a_h.begin(), b_h.begin(), c_h.begin()); + std::transform(z, z + num_items, a_h.begin(), thrust::make_zip_function(nstream_kernel{})); + REQUIRE(a_h == a); +} + +struct sum_five +{ + __device__ auto operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double + { + return a + b + c + d + e; + } +}; + +CUB_TEST("DeviceTransform::Transform add five streams", "[device][device_transform]", algorithms) +{ + using offset_t = int; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + + constexpr int num_items = 100; + c2h::device_vector a(num_items, 1); + c2h::device_vector b(num_items, 2); + c2h::device_vector c(num_items, 3); + c2h::device_vector d(num_items, 4); + c2h::device_vector e(num_items, 5); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), + result.begin(), + num_items, + sum_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + REQUIRE(reference == result); +} + +struct give_me_five +{ + __device__ auto operator()() const -> int + { + return 5; + } +}; + +CUB_TEST("DeviceTransform::Transform no streams", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector result(num_items); + transform_many(::cuda::std::tuple<>{}, result.begin(), num_items, give_me_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 5); + REQUIRE(reference == result); +} + +CUB_TEST("DeviceTransform::Transform fancy input iterator types", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + thrust::counting_iterator a{0}; + thrust::counting_iterator b{10}; + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a, b), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + 
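// The host-side references in these transform tests follow one pattern; for the
// three-input nstream case above it is the zip-iterator form sketched here
// (scalar 42 as in nstream_kernel):
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>

#include <algorithm>

struct nstream_sketch
{
  int operator()(int a, int b, int c) const { return a + b + 42 * c; }
};

inline thrust::host_vector<int> nstream_reference(
  const thrust::host_vector<int>& a, const thrust::host_vector<int>& b, const thrust::host_vector<int>& c)
{
  thrust::host_vector<int> out(a.size());
  auto z = thrust::make_zip_iterator(a.begin(), b.begin(), c.begin());
  std::transform(z, z + out.size(), out.begin(), thrust::make_zip_function(nstream_sketch{}));
  return out;
}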
c2h::host_vector<type> reference_h(num_items);
+  std::transform(a, a + num_items, b, reference_h.begin(), std::plus{});
+  REQUIRE(reference_h == result);
+}
+
+CUB_TEST("DeviceTransform::Transform fancy output iterator type", "[device][device_transform]", algorithms)
+{
+  using type     = int;
+  using offset_t = int;
+  constexpr auto alg = c2h::get<0, TestType>::value;
+  FILTER_UNSUPPORTED_ALGS
+
+  constexpr int num_items = 100;
+  c2h::device_vector<type> a(num_items, 13);
+  c2h::device_vector<type> b(num_items, 35);
+  c2h::device_vector<type> result(num_items);
+
+  using thrust::placeholders::_1;
+  auto out = thrust::make_transform_output_iterator(result.begin(), _1 + 4);
+  transform_many_with_alg<alg, offset_t>(
+    ::cuda::std::make_tuple(a.begin(), b.begin()), out, num_items, ::cuda::std::plus{});
+  REQUIRE(result == c2h::device_vector<type>(num_items, (13 + 35) + 4));
+}
+
+CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][device_transform]")
+{
+  using type = int;
+
+  constexpr int num_items = 100;
+  thrust::counting_iterator<type> a{0};
+  c2h::device_vector<type> b(num_items, 10);
+
+  c2h::device_vector<type> result(num_items);
+  transform_many(::cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, ::cuda::std::plus{});
+
+  // compute reference and verify
+  c2h::host_vector<type> b_h = b;
+  c2h::host_vector<type> reference_h(num_items);
+  std::transform(a, a + num_items, b_h.begin(), reference_h.begin(), std::plus{});
+  REQUIRE(reference_h == result);
+}
+
+struct plus_needs_stable_address
+{
+  int* a;
+  int* b;
+
+  _CCCL_HOST_DEVICE int operator()(const int& v) const
+  {
+    const auto i = &v - a;
+    return v + b[i];
+  }
+};
+
+CUB_TEST("DeviceTransform::Transform address stability", "[device][device_transform]")
+{
+  using type = int;
+
+  constexpr int num_items = 100;
+  c2h::device_vector<type> a(num_items);
+  c2h::device_vector<type> b(num_items);
+  thrust::sequence(a.begin(), a.end());
+  thrust::sequence(b.begin(), b.end(), 42);
+
+  c2h::device_vector<type> result(num_items);
+  transform_many_stable(
+    ::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data())),
+    result.begin(),
+    num_items,
+    plus_needs_stable_address{thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())});
+
+  // compute reference and verify
+  c2h::host_vector<type> a_h = a;
+  c2h::host_vector<type> b_h = b;
+  c2h::host_vector<type> reference_h(num_items);
+  std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{});
+  REQUIRE(reference_h == result);
+}
+
+// Non-trivially-copyable/relocatable type which cannot be copied using std::memcpy or cudaMemcpy
+struct non_trivial
+{
+  int data;
+
+  non_trivial() = default;
+
+  _CCCL_HOST_DEVICE explicit non_trivial(int data)
+      : data(data)
+  {}
+
+  _CCCL_HOST_DEVICE non_trivial(const non_trivial& nt)
+      : data(nt.data)
+  {}
+
+  _CCCL_HOST_DEVICE auto operator=(const non_trivial& nt) -> non_trivial&
+  {
+    data = nt.data;
+    return *this;
+  }
+
+  _CCCL_HOST_DEVICE auto operator-() const -> non_trivial
+  {
+    return non_trivial{-data};
+  }
+
+  friend _CCCL_HOST_DEVICE auto operator==(non_trivial a, non_trivial b) -> bool
+  {
+    return a.data == b.data;
+  }
+};
+static_assert(!::cuda::std::is_trivially_copyable<non_trivial>::value, ""); // as required by the standard
+static_assert(!thrust::is_trivially_relocatable<non_trivial>::value, ""); // CUB uses this check internally
+
+// Note(bgruber): I gave up on writing a test that checks whether the copy ctor/assignment operator is actually called
+// (e.g. by tracking/counting invocations of those), since C++ allows (but does not guarantee) elision of these operations.
+// Also thrust algorithms perform a lot of copies in-between, so the test needs to use only raw allocations and +// iteration for setup and checking. +CUB_TEST("DeviceTransform::Transform not trivially relocatable", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector input(num_items, non_trivial{42}); + c2h::device_vector result(num_items); + transform_many( + ::cuda::std::make_tuple(thrust::raw_pointer_cast(input.data())), result.begin(), num_items, ::cuda::std::negate<>{}); + + const auto reference = c2h::device_vector(num_items, non_trivial{-42}); + REQUIRE((reference == result)); +} + +CUB_TEST("DeviceTransform::Transform buffer start alignment", + "[device][device_transform]", + c2h::type_list) +{ + using type = typename c2h::get<0, TestType>; + + constexpr int num_items = 1000; + const int offset = GENERATE(1, 2, 4, 8, 16, 32, 64, 128); // global memory is always at least 256 byte aligned + CAPTURE(c2h::demangle(typeid(type).name()), offset); + c2h::device_vector input(num_items); + thrust::sequence(input.begin(), input.end()); + c2h::device_vector result(num_items); + using thrust::placeholders::_1; + transform_many(::cuda::std::make_tuple(input.begin() + offset), + result.begin() + offset, + num_items - offset, + _1 * 10); // FIXME(bgruber): does not work on negative + + c2h::device_vector reference(num_items); + thrust::tabulate(reference.begin() + offset, reference.end(), (_1 + offset) * 10); + REQUIRE(reference == result); +} + +namespace Catch +{ +template +struct StringMaker> +{ + static auto convert(cub::detail::transform::aligned_base_ptr abp) -> std::string + { + std::stringstream ss; + ss << "{ptr: " << abp.ptr << ", head_padding: " << abp.head_padding << "}"; + return ss.str(); + } +}; +} // namespace Catch + +// TODO(bgruber): rewrite this example using int3 +CUB_TEST("DeviceTransform::Transform aligned_base_ptr", "[device][device_transform]") +{ + alignas(128) int arr[256]; + using namespace cub::detail::transform; + CHECK(make_aligned_base_ptr(&arr[0], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 0}); + CHECK(make_aligned_base_ptr(&arr[1], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 4}); + CHECK(make_aligned_base_ptr(&arr[5], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 20}); + CHECK(make_aligned_base_ptr(&arr[31], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 124}); + CHECK(make_aligned_base_ptr(&arr[32], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 0}); + CHECK(make_aligned_base_ptr(&arr[33], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 4}); + CHECK(make_aligned_base_ptr(&arr[127], 128) == aligned_base_ptr{reinterpret_cast(&arr[96]), 124}); + CHECK(make_aligned_base_ptr(&arr[128], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 0}); + CHECK(make_aligned_base_ptr(&arr[129], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 4}); +} diff --git a/cub/test/catch2_test_device_transform_api.cu b/cub/test/catch2_test_device_transform_api.cu new file mode 100644 index 0000000000..46388ed6b2 --- /dev/null +++ b/cub/test/catch2_test_device_transform_api.cu @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include +#include + +#include "catch2_test_helper.h" + +// need a separate function because the ext. 
lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_api() +{ + // example-begin transform-many + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{5.2f, 3.1f, -1.1f, 3.0f}; + auto input3 = thrust::counting_iterator{100}; + auto op = [] __device__(int a, float b, int c) { + return (a + b) * c; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::Transform( + ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{520, 111, 397, 618}; + // example-end transform-many + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::Transform API example", "[device][device_transform]") +{ + test_transform_api(); +} + +// need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_stable_api() +{ + // example-begin transform-many-stable + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{52, 31, -11, 30}; + + auto* input1_ptr = thrust::raw_pointer_cast(input1.data()); + auto* input2_ptr = thrust::raw_pointer_cast(input2.data()); + + auto op = [input1_ptr, input2_ptr] __device__(const int& a) -> int { + const auto i = &a - input1_ptr; // we depend on the address of a + return a + input2_ptr[i]; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::TransformStableArgumentAddresses( + ::cuda::std::make_tuple(input1_ptr), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{52, 29, -6, 33}; + // example-end transform-many-stable + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::TransformStableArgumentAddresses API example", "[device][device_transform]") +{ + test_transform_stable_api(); +} diff --git a/cub/test/catch2_test_helper.h b/cub/test/catch2_test_helper.h index e62b98adf1..7689c416f3 100644 --- a/cub/test/catch2_test_helper.h +++ b/cub/test/catch2_test_helper.h @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -41,6 +42,7 @@ _CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variableds #endif // nvcc-11 +#include #include #include @@ -133,8 +135,8 @@ struct bitwise_equal bool operator()(const T& a, const T& b) const { using bits_t = typename cub::Traits::UnsignedBits; - bits_t a_bits = c2h::bit_cast(a); - bits_t b_bits = c2h::bit_cast(b); + bits_t a_bits = ::cuda::std::bit_cast(a); + bits_t b_bits = ::cuda::std::bit_cast(b); return a_bits == b_bits; } }; @@ -250,10 +252,22 @@ struct Catch::StringMaker #define CUB_TEST_STR(a) #a +namespace detail +{ +inline std::size_t adjust_seed_count(std::size_t requested) +{ + // Setting this environment variable forces a fixed number of seeds to be generated, regardless of the requested + // count. Set to 1 to reduce redundant, expensive testing when using sanitizers, etc. + static const char* override_str = std::getenv("CCCL_SEED_COUNT_OVERRIDE"); + static int override = override_str ? std::atoi(override_str) : 0; + return override_str ? 
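// A sketch of the CCCL_SEED_COUNT_OVERRIDE mechanism implemented above: the
// environment is inspected once via static-local initialization, so every
// CUB_SEED expansion reuses the same decision without calling getenv again.
#include <cstddef>
#include <cstdlib>

inline std::size_t adjusted_seed_count(std::size_t requested)
{
  static const char* env = std::getenv("CCCL_SEED_COUNT_OVERRIDE"); // evaluated once
  static const std::size_t forced = env ? static_cast<std::size_t>(std::atoi(env)) : 0;
  return env ? forced : requested;
}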
override : requested; +} +} // namespace detail + #define CUB_SEED(N) \ c2h::seed_t \ { \ GENERATE_COPY(take( \ - N, \ + detail::adjust_seed_count(N), \ random(std::numeric_limits::min(), std::numeric_limits::max()))) \ } diff --git a/cub/test/catch2_test_launch_helper.h b/cub/test/catch2_test_launch_helper.h index 311fea93b1..4add1d15d1 100644 --- a/cub/test/catch2_test_launch_helper.h +++ b/cub/test/catch2_test_launch_helper.h @@ -31,7 +31,8 @@ #include "catch2_test_helper.h" -//! @file This file contains utilities for device-scope API tests +//! @file +//! This file contains utilities for device-scope API tests //! //! Device-scope API in CUB can be launched from the host or device side. //! Utilities in this file facilitate testing in both cases. @@ -73,7 +74,7 @@ //! Consult with `test/catch2_test_cdp_wrapper.cu` for more usage examples. #if !defined(TEST_LAUNCH) -# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1 +# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1:2 #endif #define DECLARE_INVOCABLE(API, WRAPPED_API_NAME, TMPL_HEAD_OPT, TMPL_ARGS_OPT) \ diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 466c3fa978..8dddb38c57 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -55,6 +55,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") "#include \n" "#include \n" "#include \n" + "#include \n" " \n" "extern \"C\" __global__ void kernel(int *ptr, int *errors) \n" "{ \n" @@ -225,10 +226,11 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") int ptx_version{}; cub::PtxVersion(ptx_version); const std::string arch = std::string("-arch=sm_") + std::to_string(ptx_version / 10); + const std::string std = std::string("-std=c++") + std::to_string(_CCCL_STD_VER - 2000); - constexpr int num_includes = 5; + constexpr int num_includes = 6; const char* includes[num_includes] = { - NVRTC_CUB_PATH, NVRTC_THRUST_PATH, NVRTC_LIBCUDACXX_PATH, NVRTC_CTK_PATH, arch.c_str()}; + NVRTC_CUB_PATH, NVRTC_THRUST_PATH, NVRTC_LIBCUDACXX_PATH, NVRTC_CTK_PATH, arch.c_str(), std.c_str()}; std::size_t log_size{}; nvrtcResult compile_result = nvrtcCompileProgram(prog, num_includes, includes); diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu index 6d36378882..8c1df1a80c 100644 --- a/cub/test/test_block_radix_rank.cu +++ b/cub/test/test_block_radix_rank.cu @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -240,7 +241,7 @@ void Test() cub::detail::block_radix_rank_t; using storage_t = typename block_radix_rank::TempStorage; - cub::Int2Type<(sizeof(storage_t) <= 48 * 1024)> fits_smem_capacity; + cub::Int2Type<(sizeof(storage_t) <= cub::detail::max_smem_per_block)> fits_smem_capacity; TestValid(fits_smem_capacity); } diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake index 53cf7b8af4..0b2c0a2737 100644 --- a/cudax/cmake/cudaxBuildCompilerTargets.cmake +++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake @@ -9,7 +9,6 @@ include("${cudax_SOURCE_DIR}/cmake/AppendOptionIfAvailable.cmake") function(cudax_build_compiler_targets) - set(cxx_compile_definitions LIBCUDACXX_ENABLE_EXCEPTIONS) set(cxx_compile_options) set(cuda_compile_options) @@ -66,6 +65,8 @@ function(cudax_build_compiler_targets) # GCC 7.3 complains about name mangling changes due to `noexcept` append_option_if_available("-Wno-noexcept-type" cxx_compile_options) + + append_option_if_available("-Wmissing-field-initializers" cxx_compile_options) endif() if ("Clang" STREQUAL 
"${CMAKE_CXX_COMPILER_ID}") diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake index 29a3bd58ca..824b1a4fda 100644 --- a/cudax/cmake/cudaxHeaderTesting.cmake +++ b/cudax/cmake/cudaxHeaderTesting.cmake @@ -14,6 +14,12 @@ file(GLOB_RECURSE headers "${cudax_SOURCE_DIR}/include/*.h" ) +# The following internal headers are not required to compile independently: +list(REMOVE_ITEM headers + "cuda/experimental/__async/prologue.cuh" + "cuda/experimental/__async/epilogue.cuh" +) + set(headertest_srcs) foreach (header IN LISTS headers) set(headertest_src "headers/${header}.cu") diff --git a/cudax/cmake/header_test.in.cu b/cudax/cmake/header_test.in.cu index 771ca319db..fd2df1987d 100644 --- a/cudax/cmake/header_test.in.cu +++ b/cudax/cmake/header_test.in.cu @@ -34,7 +34,9 @@ #define I CUDAX_MACRO_CHECK('I', complex.h) // windows.h conflicts -#define small CUDAX_MACRO_CHECK('small', windows.h) +// @eniebler 2024-08-30: This test is disabled because it causes build +// failures in some configurations. +// #define small CUDAX_MACRO_CHECK('small', windows.h) // We can't enable these checks without breaking some builds -- some standard // library implementations unconditionally `#undef` these macros, which then // causes random failures later. diff --git a/cudax/include/cuda/experimental/__async/async.cuh b/cudax/include/cuda/experimental/__async/async.cuh new file mode 100644 index 0000000000..ed53717bca --- /dev/null +++ b/cudax/include/cuda/experimental/__async/async.cuh @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_ASYNC_DETAIL_ASYNC +#define __CUDAX_ASYNC_DETAIL_ASYNC + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// Include this first +#include + +// Include the other implementation headers: +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif // __CUDAX_ASYNC_DETAIL_ASYNC diff --git a/cudax/include/cuda/experimental/__async/basic_sender.cuh b/cudax/include/cuda/experimental/__async/basic_sender.cuh new file mode 100644 index 0000000000..5730078ecc --- /dev/null +++ b/cudax/include/cuda/experimental/__async/basic_sender.cuh @@ -0,0 +1,255 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_BASIC_SENDER
+#define __CUDAX_ASYNC_DETAIL_BASIC_SENDER
+
+#include 
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+template <class _Data, class _Rcvr>
+struct __state
+{
+  _Data __data_;
+  _Rcvr __receiver_;
+};
+
+struct receiver_defaults
+{
+  using receiver_concept = __async::receiver_t;
+
+  template <class _Rcvr, class... _Args>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_value(__ignore, _Rcvr& __rcvr, _Args&&... __args) noexcept
+    -> __async::completion_signatures<__async::set_value_t(_Args...)>
+  {
+    __async::set_value(static_cast<_Rcvr&&>(__rcvr), static_cast<_Args&&>(__args)...);
+    return {};
+  }
+
+  template <class _Rcvr, class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto set_error(__ignore, _Rcvr& __rcvr, _Error&& __error) noexcept
+    -> __async::completion_signatures<__async::set_error_t(_Error)>
+  {
+    __async::set_error(static_cast<_Rcvr&&>(__rcvr), static_cast<_Error&&>(__error));
+    return {};
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static auto
+  set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()>
+  {
+    __async::set_stopped(static_cast<_Rcvr&&>(__rcvr));
+    return {};
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE static decltype(auto) get_env(__ignore, const _Rcvr& __rcvr) noexcept
+  {
+    return __async::get_env(__rcvr);
+  }
+};
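// Aside (illustrative sketch, not part of the header above): receiver_defaults
// uses a trick worth calling out. Each member both performs the completion and
// encodes the produced completion signature in its return type, so senders can
// recover the completions via decltype. A self-contained miniature, with local
// stand-in types rather than the cudax ones:
//
//   #include <type_traits>
//
//   template <class... Sigs>
//   struct completion_signatures {};
//
//   struct set_value_t {};
//
//   template <class... Args>
//   auto default_set_value(Args&&...) noexcept
//     -> completion_signatures<set_value_t(Args...)>
//   {
//     // ... forward to the wrapped receiver here ...
//     return {};
//   }
//
//   // The signature is recoverable from the return type alone:
//   static_assert(std::is_same_v<decltype(default_set_value(1, 2.0)),
//                                completion_signatures<set_value_t(int, double)>>);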
+
+template <class _Data, class _Rcvr>
+struct basic_receiver
+{
+  using receiver_concept = __async::receiver_t;
+  using __rcvr_t = typename _Data::receiver_tag;
+  __state<_Data, _Rcvr>& __state_;
+
+  template <class... _Args>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_value(_Args&&... __args) noexcept
+  {
+    __rcvr_t::set_value(__state_.__data_, __state_.__receiver_, (_Args&&) __args...);
+  }
+
+  template <class _Error>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_error(_Error&& __error) noexcept
+  {
+    __rcvr_t::set_error(__state_.__data_, __state_.__receiver_, (_Error&&) __error);
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void set_stopped() noexcept
+  {
+    __rcvr_t::set_stopped(__state_.__data_, __state_.__receiver_);
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept
+  {
+    return __rcvr_t::get_env(__state_.__data_, __state_.__receiver_);
+  }
+};
+
+template <class _Rcvr>
+_CCCL_INLINE_VAR constexpr bool has_no_environment = _CUDA_VSTD::is_same_v<_Rcvr, receiver_archetype>;
+
+template <bool _HasStopped, class _Data, class _Rcvr>
+struct __mk_completions
+{
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  template <class... _Args>
+  using __set_value_t =
+    decltype(+*__rcvr_t::set_value(__declval<_Data&>(), __declval<receiver_archetype&>(), __declval<_Args>()...));
+
+  template <class _Error>
+  using __set_error_t =
+    decltype(+*__rcvr_t::set_error(__declval<_Data&>(), __declval<receiver_archetype&>(), __declval<_Error>()));
+
+  using __set_stopped_t = __async::completion_signatures<>;
+};
+
+template <class _Data, class _Rcvr>
+struct __mk_completions<true, _Data, _Rcvr> : __mk_completions<false, _Data, _Rcvr>
+{
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  using __set_stopped_t = decltype(+*__rcvr_t::set_stopped(__declval<_Data&>(), __declval<receiver_archetype&>()));
+};
+
+template <class... _Values>
+using __ignore_value_signature = __async::completion_signatures<>;
+
+template <class _Error>
+using __ignore_error_signature = __async::completion_signatures<>;
+
+template <class _Completions>
+constexpr bool __has_stopped =
+  !_CUDA_VSTD::is_same_v<__async::completion_signatures<>,
+                         __async::transform_completion_signatures<_Completions,
+                                                                  __async::completion_signatures<>,
+                                                                  __ignore_value_signature,
+                                                                  __ignore_error_signature>>;
+
+template <bool _PotentiallyThrowing, class _Rcvr>
+void set_current_exception_if([[maybe_unused]] _Rcvr& __rcvr) noexcept
+{
+  if constexpr (_PotentiallyThrowing)
+  {
+    __async::set_error(static_cast<_Rcvr&&>(__rcvr), ::std::current_exception());
+  }
+}
+
+// A generic type that holds the data for an async operation, and
+// that provides a `start` method for enqueuing the work.
+template <class _Sndr, class _Data, class _Rcvr>
+struct __basic_opstate
+{
+  using __rcvr_t = basic_receiver<_Data, _Rcvr>;
+  using __completions_t = completion_signatures_of_t<_Sndr, __rcvr_t>;
+  using __traits_t = __mk_completions<__has_stopped<__completions_t>, _Data, _Rcvr>;
+
+  using completion_signatures = //
+    transform_completion_signatures<__completions_t,
+                                    // TODO: add set_error_t(exception_ptr) if constructing
+                                    // the state or connecting the sender is potentially throwing.
+                                    __async::completion_signatures<>,
+                                    __traits_t::template __set_value_t,
+                                    __traits_t::template __set_error_t,
+                                    typename __traits_t::__set_stopped_t>;
+
+  _CCCL_HOST_DEVICE __basic_opstate(_Sndr&& __sndr, _Data __data, _Rcvr __rcvr)
+      : __state_{static_cast<_Data&&>(__data), static_cast<_Rcvr&&>(__rcvr)}
+      , __op_(__async::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{__state_}))
+  {}
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE void start() noexcept
+  {
+    __async::start(__op_);
+  }
+
+  __state<_Data, _Rcvr> __state_;
+  __async::connect_result_t<_Sndr, __rcvr_t> __op_;
+};
+
+template <class _Sndr, class _Rcvr>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr)
+{
+  auto [__tag, __data, __child] = static_cast<_Sndr&&>(__sndr);
+  using __data_t = decltype(__data);
+  using __child_t = decltype(__child);
+  (void) __tag;
+  return __basic_opstate(
+    static_cast<__child_t&&>(__child), static_cast<__data_t&&>(__data), static_cast<_Rcvr&&>(__rcvr));
+}
+
+template <class _Data, class... _Sndrs>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto
+__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...))
+{
+  return __data.get_attrs(__sndrs...);
+}
+
+template <class _Data, class... _Sndrs>
+_CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto
+__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...))
+{
+  return __async::get_env(__sndrs...);
+}
+
+template <class _Data, class... _Sndrs>
+struct basic_sender;
+
+template <class _Data, class _Sndr>
+struct basic_sender<_Data, _Sndr>
+{
+  using sender_concept = __async::sender_t;
+  using __tag_t = typename _Data::sender_tag;
+  using __rcvr_t = typename _Data::receiver_tag;
+
+  _CCCL_NO_UNIQUE_ADDRESS __tag_t __tag_;
+  _Data __data_;
+  _Sndr __sndr_;
+
+  // Connect the sender to the receiver (the continuation) and
+  // return the state_type object for this operation.
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) &&
+  {
+    return __make_opstate(static_cast<basic_sender&&>(*this), static_cast<_Rcvr&&>(__rcvr));
+  }
+
+  template <class _Rcvr>
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE auto connect(_Rcvr __rcvr) const&
+  {
+    return __make_opstate(*this, static_cast<_Rcvr&&>(__rcvr));
+  }
+
+  _CCCL_HOST_DEVICE _CUDAX_ALWAYS_INLINE decltype(auto) get_env() const noexcept
+  {
+    return __async::__get_attrs(0, __data_, __sndr_);
+  }
+};
+
+template <class _Data, class... _Sndrs>
+basic_sender(__ignore, _Data, _Sndrs...) -> basic_sender<_Data, _Sndrs...>;
+
+} // namespace cuda::experimental::__async
+
+#include <cuda/experimental/__async/epilogue.cuh>
+
+#endif
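The file above supplies the generic plumbing behind cudax's sender algorithms. For orientation, here is a dependency-free toy of the protocol it implements (illustrative only, not the cudax API): connect binds a sender to a receiver and returns an operation state, and start runs the work, which finishes through exactly one of the receiver's three completion channels.

#include <exception>
#include <iostream>

// A receiver: one callback per completion channel.
struct print_receiver
{
  void set_value(int v) noexcept { std::cout << "value: " << v << '\n'; }
  void set_error(std::exception_ptr) noexcept { std::cout << "error\n"; }
  void set_stopped() noexcept { std::cout << "stopped\n"; }
};

// The operation state owns everything the work needs; start() runs it.
template <class Rcvr>
struct just_op
{
  int value;
  Rcvr rcvr;
  void start() noexcept { rcvr.set_value(value); }
};

// A trivial sender that completes immediately with a stored value.
struct just_sender
{
  int value;
  template <class Rcvr>
  just_op<Rcvr> connect(Rcvr rcvr) const { return {value, rcvr}; }
};

int main()
{
  auto op = just_sender{42}.connect(print_receiver{});
  op.start(); // prints "value: 42"
}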
diff --git a/cudax/include/cuda/experimental/__async/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/completion_signatures.cuh
new file mode 100644
index 0000000000..c4edf4b618
--- /dev/null
+++ b/cudax/include/cuda/experimental/__async/completion_signatures.cuh
@@ -0,0 +1,336 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES
+#define __CUDAX_ASYNC_DETAIL_COMPLETION_SIGNATURES
+
+#include 
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include 
+
+#include 
+#include 
+
+#include <cuda/experimental/__async/prologue.cuh>
+
+namespace cuda::experimental::__async
+{
+// A typelist for completion signatures
+template <class... _Sigs>
+struct completion_signatures
+{};
+
+// A metafunction to determine if a type is a completion signature
+template <class _Sig>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature = false;
+
+template <class... _Values>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_value_t(_Values...)> = true;
+
+template <class _Error>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_error_t(_Error)> = true;
+
+template <>
+_CCCL_INLINE_VAR constexpr bool __is_valid_signature<set_stopped_t()> = true;
+
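// Aside (illustrative sketch, not part of the header above): the metafunction
// above works by partially specializing a variable template over function
// types whose "return type" is one of the three completion tags. A
// self-contained miniature with local tag types:
//
//   #include <exception>
//
//   struct set_value_t {};
//   struct set_error_t {};
//   struct set_stopped_t {};
//
//   template <class Sig>
//   inline constexpr bool is_valid_signature = false;
//
//   template <class... Values>
//   inline constexpr bool is_valid_signature<set_value_t(Values...)> = true;
//
//   template <class Error>
//   inline constexpr bool is_valid_signature<set_error_t(Error)> = true;
//
//   template <>
//   inline constexpr bool is_valid_signature<set_stopped_t()> = true;
//
//   static_assert(is_valid_signature<set_value_t(int, float)>);
//   static_assert(is_valid_signature<set_error_t(std::exception_ptr)>);
//   static_assert(is_valid_signature<set_stopped_t()>);
//   static_assert(!is_valid_signature<int>);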
+// The implementation of transform_completion_signatures starts here
+template <class _Sig, template <class...> class _Vy, template <class> class _Ey, class _Sy>
+extern __undefined<_Sig> __transform_sig;
+
+template <class... _Values, template <class...> class _Vy, template <class> class _Ey, class _Sy>
+extern __fn_t<_Vy<_Values...>>* __transform_sig<set_value_t(_Values...), _Vy, _Ey, _Sy>;
+
+template <class _Error, template <class...> class _Vy, template <class> class _Ey, class _Sy>
+extern __fn_t<_Ey<_Error>>* __transform_sig<set_error_t(_Error), _Vy, _Ey, _Sy>;
+
+template