From 9941a44887e7cf36491e82aec7c9d74eb3e81671 Mon Sep 17 00:00:00 2001
From: Wesley Maxey
Date: Wed, 24 Jul 2024 17:04:22 -0700
Subject: [PATCH 01/29] Support fetch_add and CAS on 8/16b

---
 .../std/__atomic/functions/cuda_ptx_derived.h | 118 +++++++++++++-----
 .../functions/cuda_ptx_generated_helper.h     |  46 ++++---
 libcudacxx/include/cuda/std/__atomic/types.h  |   7 +-
 libcudacxx/test/atomic_codegen/CMakeLists.txt |   5 +
 .../test/atomic_codegen/atomic_add_8b.cu      |  23 ++++
 5 files changed, 152 insertions(+), 47 deletions(-)
 create mode 100644 libcudacxx/test/atomic_codegen/atomic_add_8b.cu

diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
index 0e525bf296..25388a8273 100644
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
@@ -13,6 +13,9 @@

 #include

+#include
+
+#include "cuda/std/__atomic/functions/cuda_ptx_generated_helper.h"
 #include "cuda_ptx_generated.h"

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
@@ -33,6 +36,92 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD

 #if defined(_CCCL_CUDA_COMPILER)

+template <class _Operand>
+using __atomic_cuda_enable_non_native_load = typename enable_if<_Operand::__size <= 8, bool>::type;
+
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_load<_Operand> = 0>
+static inline _CCCL_DEVICE bool
+__cuda_atomic_load(_Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable)
+{
+  uint16_t* __aligned     = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1));
+  const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8;
+  const uint16_t __mask   = ((1 << 8) - 1) << __offset;
+
+  uint16_t __value = 0;
+
+  __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{});
+  __dst = static_cast<_Type>(__value >> __offset);
+}
+
+template <class _Type, class _Order, class _Sco>
+static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, __atomic_cuda_operand_b8, _Sco)
+{
+  uint16_t* __aligned     = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1));
+  const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8;
+  const uint16_t __mask   = ((1 << 8) - 1) << __offset;
+
+  // Algorithm for 8b CAS with 16b intrinsics
+  // __old = __window[0:16] where [__cmp] resides in either of the two 8b offsets
+  // First CAS attempt 'guesses' that the masked portion of the window is 0x00.
+  uint16_t __old       = (uint16_t(__cmp) << __offset);
+  uint16_t __old_value = 0;
+
+  bool __success = false;
+
+  // Re-emit CAS instructions until either of two conditions is met
+  while (1)
+  {
+    // Combine the desired value and most recently fetched expected masked portion of the window
+    uint16_t __attempt = (__old & ~__mask) | (uint16_t(__op) << __offset);
+
+    if (__cuda_atomic_compare_exchange(
+          __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}))
+    {
+      // CAS was successful
+      return true;
+    }
+    __old_value = (__old & __mask) >> __offset;
+    // The expected value no longer matches inside the CAS.
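+    // (Worked example with hypothetical values: __cmp = 0x2A and __op = 0x7F
+    //  on the upper byte give __offset = 8, __mask = 0xFF00, and an initial
+    //  guess of __old = 0x2A00. A fetched window of 0x2A19 loops again with
+    //  __attempt = 0x7F19, since its masked byte still equals __cmp; a fetched
+    //  window of 0x1319 exits below with __dst = 0x13.)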
+    if (__old_value != __cmp)
+    {
+      __dst = __old_value;
+      break;
+    }
+  }
+  return false;
+}
+
+// Lower level fetch_update that bypasses memorder dispatch
+template <class _Type, class _Fn, class _Order, class _Operand, class _Sco>
+_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco)
+{
+  _Type __expected = 0;
+  __cuda_atomic_load(__ptr, __expected, __atomic_cuda_relaxed{}, _Operand{}, _Sco{}, __atomic_cuda_mmio_disable{});
+  _Type __desired = __op(__expected);
+  while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{}))
+  {
+    __desired = __op(__expected);
+  }
+  return __expected;
+}
+
+template <class _Operand>
+using __atomic_cuda_enable_non_native_add = typename enable_if<_Operand::__size <= 16, bool>::type;
+
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
+{
+  __dst = __cuda_atomic_fetch_update(
+    __ptr,
+    [__op](_Type __old) {
+      return __old + __op;
+    },
+    _Order{},
+    __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
+    _Sco{});
+}
+
 template
 _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco)
 {
@@ -207,35 +296,6 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int
 //   memcpy(__ret, &__old, sizeof(__proxy_t));
 // }

-// template = 0>
-// _CCCL_DEVICE bool __atomic_compare_exchange_cuda(
-//   _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder,
-//   _Sco)
-// {
-//   auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1));
-//   auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8;
-//   auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset;

-//   uint32_t __old = *__expected << __offset;
-//   uint32_t __old_value;
-//   while (1)
-//   {
-//     __old_value = (__old & __mask) >> __offset;
-//     if (__old_value != *__expected)
-//     {
-//       break;
-//     }
-//     uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset);
-//     if (__atomic_compare_exchange_cuda(
-//           __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{}))
-//     {
-//       return true;
-//     }
-//   }
-//   *__expected = __old_value;
-//   return false;
-// }
-
 // template = 0>
 // _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco)
 // {
diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h
index 861e9f7b08..cc3f3f4659 100644
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h
@@ -69,8 +69,15 @@ enum class __atomic_cuda_operand

 template <__atomic_cuda_operand _Op, size_t _Size>
 struct __atomic_cuda_operand_tag
-{};
+{
+  static constexpr auto __op   = _Op;
+  static constexpr auto __size = _Size;
+};

+using __atomic_cuda_operand_f8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 8>;
+using __atomic_cuda_operand_s8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 8>;
+using __atomic_cuda_operand_u8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 8>;
+using __atomic_cuda_operand_b8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 8>;
 using __atomic_cuda_operand_f16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 16>;
 using __atomic_cuda_operand_s16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 16>;
 using __atomic_cuda_operand_u16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 16>;
@@ -103,13 +110,15 @@ struct __atomic_longlong2

 template <class _Type>
 using __atomic_cuda_deduce_bitwise =
-  _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<uint16_t, __atomic_cuda_operand_b16>,
-      _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_b32>,
-          _If<sizeof(_Type) == 8, __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_b64>,
-              __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>;
+  _If<sizeof(_Type) == 1, __atomic_cuda_operand_deduction<uint8_t, __atomic_cuda_operand_b8>,
+      _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<uint16_t, __atomic_cuda_operand_b16>,
+          _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_b32>,
+              _If<sizeof(_Type) == 8, __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_b64>,
+                  __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>>;

 template <class _Type>
 using __atomic_cuda_deduce_arithmetic =
@@ -118,12 +127,21 @@ using __atomic_cuda_deduce_arithmetic =
       __atomic_cuda_operand_deduction<float, __atomic_cuda_operand_f32>,
       __atomic_cuda_operand_deduction<double, __atomic_cuda_operand_f64>>,
   _If<_CCCL_TRAIT(is_signed, _Type),
-      _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<int32_t, __atomic_cuda_operand_s32>,
-          __atomic_cuda_operand_deduction<int64_t, __atomic_cuda_operand_u64>>, // There is no atom.add.s64
-      _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_u32>,
-          __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_u64>>>>;
+      _If<sizeof(_Type) == 1, __atomic_cuda_operand_deduction<int8_t, __atomic_cuda_operand_s8>,
+          _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<int16_t, __atomic_cuda_operand_s16>,
+              _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<int32_t, __atomic_cuda_operand_s32>,
+                  __atomic_cuda_operand_deduction<int64_t, __atomic_cuda_operand_u64>>>>, // There is no
+                                                                                          // atom.add.s64
+      _If<sizeof(_Type) == 1, __atomic_cuda_operand_deduction<uint8_t, __atomic_cuda_operand_u8>,
+          _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<uint16_t, __atomic_cuda_operand_u16>,
+              _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_u32>,
+                  __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_u64>>>>>>;

 template <class _Type>
 using __atomic_cuda_deduce_minmax =
diff --git a/libcudacxx/include/cuda/std/__atomic/types.h b/libcudacxx/include/cuda/std/__atomic/types.h
index 4b58ba4901..73e5ef0044 100644
--- a/libcudacxx/include/cuda/std/__atomic/types.h
+++ b/libcudacxx/include/cuda/std/__atomic/types.h
@@ -32,10 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <typename _Tp>
 struct __atomic_traits
 {
-  static constexpr bool __atomic_requires_lock  = !__atomic_is_always_lock_free<_Tp>::__value;
-  static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4;
-  static constexpr bool __atomic_supports_reference =
-    __atomic_is_always_lock_free<_Tp>::__value && (sizeof(_Tp) >= 4 && sizeof(_Tp) <= 8);
+  static constexpr bool __atomic_requires_lock      = !__atomic_is_always_lock_free<_Tp>::__value;
+  static constexpr bool __atomic_requires_small     = sizeof(_Tp) < 4;
+  static constexpr bool __atomic_supports_reference = __atomic_is_always_lock_free<_Tp>::__value && sizeof(_Tp) <= 8;
 };

 template
diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt
index 095fa41cf7..7aee09700a 100644
--- a/libcudacxx/test/atomic_codegen/CMakeLists.txt
+++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt
@@ -16,6 +16,11 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests)
     STATIC ${test_path}
   )

+  set_target_properties(
+    atomic_codegen_${test_name}
+    PROPERTIES CUDA_ARCHITECTURES "80"
+  )
+
   ## Important for testing the local headers
   target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include")
   add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name})
diff --git a/libcudacxx/test/atomic_codegen/atomic_add_8b.cu b/libcudacxx/test/atomic_codegen/atomic_add_8b.cu
new file mode 100644
index 0000000000..8ff6f624e3
--- /dev/null
+++ b/libcudacxx/test/atomic_codegen/atomic_add_8b.cu
@@ -0,0 +1,23 @@
+#include <cuda/atomic>
+
+__global__ void add_relaxed_device_non_volatile(uint8_t* data, uint8_t* out, uint8_t n)
+{
+  auto ref = cuda::atomic_ref<uint8_t, cuda::thread_scope_device>{*(data)};
+  *out = ref.fetch_add(n, cuda::std::memory_order_relaxed);
+}
+
+/*
+
+; SM8X-LABEL: .target sm_80
+; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]](
+; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0];
+; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1];
+; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2];
+; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]];
+; SM8X-NEXT: //
+; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];
+; SM8X-NEXT: //
+; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]];
+; SM8X-NEXT: ret;
+
+*/
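
(For context, beyond the FileCheck-based codegen test above: a minimal smoke
test of the new 8b path. This is an illustrative sketch, not part of the
patch; it assumes a toolchain with this series applied, so that
cuda::atomic_ref over an 8-bit type compiles, and a device with managed
memory support.)

    #include <cuda/atomic>
    #include <cstdint>
    #include <cstdio>

    __global__ void bump(uint8_t* p)
    {
      cuda::atomic_ref<uint8_t, cuda::thread_scope_device> ref{*p};
      ref.fetch_add(1, cuda::std::memory_order_relaxed);
    }

    int main()
    {
      uint8_t* p;
      cudaMallocManaged(&p, sizeof(uint8_t));
      *p = 0;
      bump<<<1, 100>>>(p);
      cudaDeviceSynchronize();
      std::printf("%d\n", int(*p)); // expect 100: each of the 100 threads adds 1
      cudaFree(p);
      return 0;
    }

From 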
bfd97ee727ba42ea327cc0c077d7f582249d7ed9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 25 Jul 2024 12:44:08 -0700 Subject: [PATCH 02/29] Add 16b test --- .../test/atomic_codegen/atomic_add_16b.cu | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 libcudacxx/test/atomic_codegen/atomic_add_16b.cu diff --git a/libcudacxx/test/atomic_codegen/atomic_add_16b.cu b/libcudacxx/test/atomic_codegen/atomic_add_16b.cu new file mode 100644 index 0000000000..95bc80e9a4 --- /dev/null +++ b/libcudacxx/test/atomic_codegen/atomic_add_16b.cu @@ -0,0 +1,23 @@ +#include + +__global__ void add_relaxed_device_non_volatile(uint16_t* data, uint16_t* out, uint16_t n) +{ + auto ref = cuda::atomic_ref{*(data)}; + *out = ref.fetch_add(n, cuda::std::memory_order_relaxed); +} + +/* + +; SM8X-LABEL: .target sm_80 +; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; +; SM8X-NEXT: // +; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; +; SM8X-NEXT: // +; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; +; SM8X-NEXT: ret; + +*/ From 24bb639393a682350ffca445bb674a0f09af7e31 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 2 Aug 2024 15:53:45 -0700 Subject: [PATCH 03/29] Fix issues found when enabling 8/16b in a heterogeneous test, PTX seems to be invalid though --- .../std/__atomic/functions/cuda_ptx_derived.h | 77 +++++++++++++++++-- .../atomic/reference_cuda.pass.cpp | 6 +- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 25388a8273..d36f5a8777 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -40,12 +40,11 @@ template using __atomic_cuda_enable_non_native_load = typename enable_if<_Operand::__size <= 8, bool>::type; template = 0> -static inline _CCCL_DEVICE bool -__cuda_atomic_load(_Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +static inline _CCCL_DEVICE void +__cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1)); const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8; - const uint16_t __mask = ((1 << 8) - 1) << __offset; uint16_t __value = 0; @@ -67,8 +66,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( uint16_t __old = (uint16_t(__op) << __offset); uint16_t __old_value = 0; - bool __success = false; - // Reemit CAS instructions until either of two conditions are met while (1) { @@ -109,6 +106,9 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or template using __atomic_cuda_enable_non_native_add = typename enable_if<_Operand::__size <= 16, bool>::type; +template +using __atomic_cuda_enable_non_native_exchange = typename enable_if<_Operand::__size <= 8, bool>::type; + template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { @@ -122,6 +122,73 @@ static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_add(_Type* __ptr, _Type& __d _Sco{}); } +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old & __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old ^ __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old | __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + // Store requires cas on 8b types + __cuda_atomic_fetch_update( + __ptr, + [__val](_Type __old) { + return __val; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + template _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp index ddef76ec28..fbe613fe44 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp @@ -175,19 +175,15 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL + // todo validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); From f93e690c7ba7904f280599aea92702b9f9546918 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 7 Aug 2024 17:07:04 -0700 Subject: [PATCH 04/29] Remove 16b cas and use only 32b cas. 
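
All 8b and 16b atomics are now funneled through the 32b compare-and-swap
path, so the 16b cas/exch/ld/st emitters and their generated inline-asm
variants can be dropped from the codegen lists. A sketch of the shape this
emulation takes (illustrative only -- names and layout here are simplified,
not the exact header text):

    // Hypothetical standalone equivalent of the emulated 8b fetch_add:
    __device__ unsigned char fetch_add_8b_via_32b_cas(unsigned char* p, unsigned char v)
    {
      unsigned int* w    = (unsigned int*) ((intptr_t) p & ~(sizeof(unsigned int) - 1));
      unsigned int shift = ((intptr_t) p & (sizeof(unsigned int) - 1)) * 8;
      unsigned int mask  = 0xFFu << shift;
      unsigned int old   = *w;
      unsigned int assumed;
      do
      {
        assumed            = old;
        unsigned int byte  = ((assumed & mask) >> shift) + v;      // wraps mod 256 below
        unsigned int fresh = (assumed & ~mask) | ((byte & 0xFFu) << shift);
        old                = atomicCAS(w, assumed, fresh);         // 32b CAS on the aligned window
      } while (assumed != old);
      return (unsigned char) ((old & mask) >> shift);
    }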
--- .../codegen/generators/compare_and_swap.h | 1 - libcudacxx/codegen/generators/exchange.h | 1 - libcudacxx/codegen/generators/ld_st.h | 2 - .../std/__atomic/functions/cuda_ptx_derived.h | 319 +++------------ .../__atomic/functions/cuda_ptx_generated.h | 368 ------------------ 5 files changed, 57 insertions(+), 634 deletions(-) diff --git a/libcudacxx/codegen/generators/compare_and_swap.h b/libcudacxx/codegen/generators/compare_and_swap.h index 5a970735c0..17b11ab69c 100644 --- a/libcudacxx/codegen/generators/compare_and_swap.h +++ b/libcudacxx/codegen/generators/compare_and_swap.h @@ -83,7 +83,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/codegen/generators/exchange.h b/libcudacxx/codegen/generators/exchange.h index dcfe66f147..f89a0075ae 100644 --- a/libcudacxx/codegen/generators/exchange.h +++ b/libcudacxx/codegen/generators/exchange.h @@ -81,7 +81,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h index d4aec3da54..43cf80ef5c 100644 --- a/libcudacxx/codegen/generators/ld_st.h +++ b/libcudacxx/codegen/generators/ld_st.h @@ -92,7 +92,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_load( {{ asm volatile("ld{8}{4}{6}.{0}{1} %0,[%1];" : "={2}"(__dst) : "l"(__ptr) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, @@ -249,7 +248,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_store( {{ asm volatile("st{8}{4}{6}.{0}{1} [%0],%1;" :: "l"(__ptr), "{2}"(__val) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index d36f5a8777..710362864c 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -37,43 +37,46 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) template -using __atomic_cuda_enable_non_native_load = typename enable_if<_Operand::__size <= 8, bool>::type; +using __atomic_cuda_enable_non_native_size = typename enable_if<_Operand::__size <= 16, bool>::type; -template = 0> +template +using __atomic_cuda_enable_native_size = typename enable_if<_Operand::__size >= 32, bool>::type; + +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { - uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1)); - const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); + const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; - uint16_t __value = 0; + uint32_t __value = 0; - __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{}); + __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{}); __dst = static_cast<_Type>(__value >> __offset); } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, __atomic_cuda_operand_b8, _Sco) +template = 0> +static inline _CCCL_DEVICE bool 
+__cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco)
 {
-  uint16_t* __aligned     = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1));
-  const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8;
-  const uint16_t __mask   = ((1 << 8) - 1) << __offset;
+  uint32_t* __aligned     = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1));
+  const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8;
+  const uint32_t __mask   = ((1 << (sizeof(_Type) * 8)) - 1) << __offset;

-  // Algorithm for 8b CAS with 16b intrinsics
-  // __old = __window[0:16] where [__cmp] resides in either of the two 8b offsets
+  // Algorithm for 8b CAS with 32b intrinsics
+  // __old = __window[0:32] where [__cmp] resides within any of the potential offsets
   // First CAS attempt 'guesses' that the masked portion of the window is 0x00.
-  uint16_t __old       = (uint16_t(__cmp) << __offset);
-  uint16_t __old_value = 0;
+  uint32_t __old       = (uint32_t(__cmp) << __offset);
+  uint32_t __old_value = 0;

   // Re-emit CAS instructions until either of two conditions is met
   while (1)
   {
     // Combine the desired value and most recently fetched expected masked portion of the window
-    uint16_t __attempt = (__old & ~__mask) | (uint16_t(__op) << __offset);
+    uint32_t __attempt = (__old & ~__mask) | (uint32_t(__op) << __offset);

     if (__cuda_atomic_compare_exchange(
-          __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}))
+          __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{}))
     {
       // CAS was successful
@@ -94,7 +97,7 @@ template _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or
 {
   _Type __expected = 0;
-  __cuda_atomic_load(__ptr, __expected, __atomic_cuda_relaxed{}, _Operand{}, _Sco{}, __atomic_cuda_mmio_disable{});
+  __atomic_load_cuda(__ptr, __expected, __ATOMIC_RELAXED, _Sco{});
   _Type __desired = __op(__expected);
   while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{}))
   {
@@ -103,13 +106,7 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or
   return __expected;
 }

-template <class _Operand>
-using __atomic_cuda_enable_non_native_add = typename enable_if<_Operand::__size <= 16, bool>::type;
-
-template <class _Operand>
-using __atomic_cuda_enable_non_native_exchange = typename enable_if<_Operand::__size <= 8, bool>::type;
-
-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -122,7 +119,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __d
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -135,7 +132,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __d
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -148,7 +145,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __d
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -161,7 +158,32 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_exchange<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
+{
+  __dst = __cuda_atomic_fetch_update(
+    __ptr,
+    [__op](_Type __old) {
+      return __old < __op ? __old : __op;
+    },
+    _Order{},
+    __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
+    _Sco{});
+}
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
+{
+  __dst = __cuda_atomic_fetch_update(
+    __ptr,
+    [__op](_Type __old) {
+      return __old > __op ? __old : __op;
+    },
+    _Order{},
+    __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
+    _Sco{});
+}
+
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
@@ -174,7 +196,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_exchange<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void
 __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable)
 {
@@ -254,7 +276,7 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __
 }

 template = 0>
-_CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -265,7 +287,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder
     _Sco{});
 }
 template = 0>
-_CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -277,7 +299,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int _
     _Sco{});
 }

 template = 0>
-_CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -288,7 +310,7 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorde
     _Sco{});
 }
 template = 0>
-_CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -299,233 +321,6 @@ _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int
     _Sco{});
 }

-// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
-// _CCCL_DEVICE bool __atomic_compare_exchange_cuda(
-//   void volatile* __ptr,
-//   _Tp* __expected,
-//   const _Tp __desired,
-//   bool __weak,
-//   int __success_memorder,
-//   int __failure_memorder,
-//   _Sco)
-// {
-//   using __proxy_t = _If;
-//   __proxy_t __old = 0;
-//   __proxy_t __new = 0;
-//   memcpy(&__old, __expected, sizeof(__proxy_t));
-//   memcpy(&__new, &__desired, sizeof(__proxy_t));
-//   bool __result =
-//     __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{});
-//   memcpy(__expected, &__old, sizeof(__proxy_t));
-//   return __result;
-// }
-// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
-// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__old, __expected, sizeof(__proxy_t)); -// memcpy(&__new, &__desired, sizeof(__proxy_t)); -// bool __result = -// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); -// memcpy(__expected, &__old, sizeof(__proxy_t)); -// return __result; -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } - -// template = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) -// ; -// *__ret = __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected + __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected + __val; -// } -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? __expected : __val; -// } - -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? 
__expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected - __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected - __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected & __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected & __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected ^ __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected ^ __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected | __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected | __val; -// } -// return __expected; -// } - -// template -// _CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( -// _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// return __atomic_compare_exchange_cuda( -// __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); -// } - -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); -// return __ret; -// } -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); -// return __ret; -// } - _CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int) { asm volatile("" ::: "memory"); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 
e72144b68c..3b164cfc52 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -116,162 +116,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__ ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("ld.mmio.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void 
__cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("ld.mmio.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, 
_Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("ld.mmio.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, 
__atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_load( const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) @@ -872,58 +716,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &_ ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.relaxed.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.relaxed.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.relaxed.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ 
asm volatile("st.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("st.mmio.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_store( _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) @@ -1220,86 +1012,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch return __res; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acquire.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acquire.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acquire.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acquire.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.relaxed.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) 
: "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.relaxed.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.relaxed.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.relaxed.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.release.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.release.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.release.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.release.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acq_rel.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acq_rel.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acq_rel.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : 
"memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acq_rel.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } template static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) @@ -1763,86 +1475,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acquire.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acquire.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acquire.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acquire.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.relaxed.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), 
"h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.relaxed.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.relaxed.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.relaxed.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.release.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.release.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.release.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.release.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acq_rel.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acq_rel.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acq_rel.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acq_rel.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.cta.b16 %0,[%1],%2;" : "=h"(__old) 
: "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_exchange( _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) From ae24c8ecba44dee1a3794763af06d48e8e6fb867 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 8 Aug 2024 14:48:36 -0700 Subject: [PATCH 05/29] Get several tests passing for 8/16b atomics --- .../std/__atomic/functions/cuda_ptx_derived.h | 40 +++++++++---------- .../functions/cuda_ptx_generated_helper.h | 12 +----- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 710362864c..accbe65243 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -13,11 +13,6 @@ #include -#include - -#include "cuda/std/__atomic/functions/cuda_ptx_generated_helper.h" -#include "cuda_ptx_generated.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -37,17 +32,18 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) template -using __atomic_cuda_enable_non_native_size = typename enable_if<_Operand::__size <= 16, bool>::type; +using __cuda_atomic_enable_non_native_arithmetic = + typename enable_if<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>::type; template -using __atomic_cuda_enable_native_size = typename enable_if<_Operand::__size >= 32, bool>::type; +using __cuda_atomic_enable_non_native_bitwise = typename enable_if<_Operand::__size <= 16, bool>::type; -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; + const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * (sizeof(_Type) * 8); uint32_t __value = 0; @@ -55,7 +51,7 @@ __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __a __dst = static_cast<_Type>(__value >> __offset); } -template = 0> +template = 0> static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco) { @@ -66,8 +62,8 @@ __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __ // Algorithm for 8b CAS with 32b intrinsics 
// __old = __window[0:32] where [__cmp] resides within any of the potential offsets // First CAS attempt 'guesses' that the masked portion of the window is 0x00. - uint32_t __old = (uint32_t(__op) << __offset); - uint32_t __old_value = 0; + uint32_t __old = (uint32_t(__op) << __offset); + _Type __old_value = 0; // Reemit CAS instructions until either of two conditions are met while (1) @@ -81,7 +77,7 @@ __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __ // CAS was successful return true; } - __old_value = (__old & __mask) >> __offset; + __old_value = static_cast<_Type>((__old & __mask) >> __offset); // The expected value no longer matches inside the CAS. if (__old_value != __cmp) { @@ -106,7 +102,7 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or return __expected; } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -119,7 +115,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __d _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -132,7 +128,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __d _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -145,7 +141,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __d _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -158,7 +154,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -170,20 +166,20 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __d __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( __ptr, [__op](_Type __old) { - return __old < __op ? __old : __op; + return __old > __op ? 
__old : __op; }, _Order{}, __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -196,7 +192,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index cc3f3f4659..f806d3d0d8 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -144,14 +144,7 @@ using __atomic_cuda_deduce_arithmetic = __atomic_cuda_operand_deduction>>>>>; template -using __atomic_cuda_deduce_minmax = - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - __atomic_cuda_operand_deduction>, - _If, - __atomic_cuda_operand_deduction>>; +using __atomic_cuda_deduce_minmax = __atomic_cuda_deduce_arithmetic<_Type>; template using __atomic_enable_if_native_bitwise = bool; @@ -159,9 +152,6 @@ using __atomic_enable_if_native_bitwise = bool; template using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; -template -using __atomic_enable_if_not_native_arithmetic = typename enable_if::type; - template using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; From 1af8667d5e018d5f562edca26954f2c46cfc6155 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:32:05 -0700 Subject: [PATCH 06/29] Remove todo and ifdefs from tests covering 8b/16b atomics --- .../libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp | 1 - .../libcudacxx/heterogeneous/atomic/reference_std.pass.cpp | 5 ----- 2 files changed, 6 deletions(-) diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp index fbe613fe44..4cae377b27 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp @@ -175,7 +175,6 @@ using bitwise_atomic_testers = void kernel_invoker() { - // todo validate_pinned(); validate_pinned(); validate_pinned(); diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp index 9da187cee7..366b93b82f 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp @@ -167,19 +167,14 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); From ff06fa141909a65304818cb94eef0101dc57b128 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:33:22 -0700 Subject: [PATCH 07/29] Fix bug in 16b atomic load --- 
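Note: the 8/16b load widens to an aligned 32b window and shifts the loaded
value down by the byte offset within that window, so the shift must be
(addr & 3) * 8 regardless of sizeof(_Type). As an illustrative sketch, a
uint16_t at (addr & 3) == 2 needs __offset == 16, but the old formula
computed 2 * (sizeof(uint16_t) * 8) == 32 and shifted the halfword out of
the window entirely.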
.../include/cuda/std/__atomic/functions/cuda_ptx_derived.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index accbe65243..3c7b297fea 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -43,7 +43,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * (sizeof(_Type) * 8); + const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; uint32_t __value = 0; From 8daaad13d3d3f3d5abf0b8ba3e85334b7f14d2ae Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:34:31 -0700 Subject: [PATCH 08/29] Move store close to fetch_update since it is a derived primitive --- .../std/__atomic/functions/cuda_ptx_derived.h | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 3c7b297fea..f3009fefb6 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -102,6 +102,22 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or return __expected; } +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + printf("%s\r\n", __PRETTY_FUNCTION__); + // Store requires cas on 8/16b types + __cuda_atomic_fetch_update( + __ptr, + [__val](_Type __old) { + return __val; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { @@ -157,6 +173,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { + printf("%s\r\n", __PRETTY_FUNCTION__); __dst = __cuda_atomic_fetch_update( __ptr, [__op](_Type __old) { @@ -192,21 +209,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds _Sco{}); } -template = 0> -static inline _CCCL_DEVICE void -__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) -{ - // Store requires cas on 8b types - __cuda_atomic_fetch_update( - __ptr, - [__val](_Type __old) { - return __val; - }, - _Order{}, - __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, - _Sco{}); -} - template _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { @@ -274,6 +276,7 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __ template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { + printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { @@ -285,6 +288,7 @@ _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up 
__val, int __memorder, template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { + printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { From f588332a782c1f3c111d6722eb2d6afdc4440d52 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:35:35 -0700 Subject: [PATCH 09/29] Fix bug in minmax due to s64 overload missing for arithmetic types --- .../functions/cuda_ptx_generated_helper.h | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index f806d3d0d8..9b81adff18 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -144,7 +144,27 @@ using __atomic_cuda_deduce_arithmetic = __atomic_cuda_operand_deduction>>>>>; template -using __atomic_cuda_deduce_minmax = __atomic_cuda_deduce_arithmetic<_Type>; +using __atomic_cuda_deduce_minmax = + _If<_CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>, // atom.min|max.s64 + // supported + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>>>; template using __atomic_enable_if_native_bitwise = bool; From 60e8c256a1b2b63b6b6511866899fdc8d9bd9847 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:36:53 -0700 Subject: [PATCH 10/29] Add more 8/16b tests for atomic_ref --- .../std/atomics/atomics.types.generic/integral_ref.pass.cpp | 4 ++++ .../atomics.types.generic/integral_ref_constness.pass.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp index 56153f3664..5ef493b6dd 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>(); diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp index b237c862a5..57c4a10f53 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>(); From e9a79f4d84a6350179fbfe87188ce2cb3db131ed Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: 
Fri, 16 Aug 2024 16:34:52 -0700 Subject: [PATCH 11/29] Fixup remove debug prints --- .../include/cuda/std/__atomic/functions/cuda_ptx_derived.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index f3009fefb6..cc795e9495 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -106,7 +106,6 @@ template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { - printf("%s\r\n", __PRETTY_FUNCTION__); __dst = __cuda_atomic_fetch_update( __ptr, [__op](_Type __old) { @@ -276,7 +274,6 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __ template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { - printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { @@ -288,7 +285,6 @@ _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { - printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { From b713109116180fda73b339c1d0fae398f3bf7aca Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 17:08:40 -0700 Subject: [PATCH 12/29] Cleanup bitmask hell, fix bug where lower mask was ignored --- .../std/__atomic/functions/cuda_ptx_derived.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index cc795e9495..c62fede58e 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -55,21 +55,22 @@ template ((__old & __mask) >> __offset); + auto __old_value = static_cast<_Type>((__old & __valueMask) >> __offset); // The expected value no longer matches inside the CAS. if (__old_value != __cmp) { From 1c2627d57643f15a6aca66a99ec7521957d02579 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 17:09:30 -0700 Subject: [PATCH 13/29] Add test covering interleaved CAS onto same atomic window --- .../cuda/atomics/atomic_ref_small.pass.cpp | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp new file mode 100644 index 0000000000..d012c2f02b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 +// UNSUPPORTED: windows && pre-sm-70 + +// + +#include +#include +#include + +#include "test_macros.h" + +/* +Test goals: +Interleaved 8b/16b access to a 32b window while there is thread contention. + +for 8b: +Launch 1028 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +caused by interleaved access to different parts of the window. + +for 16b: +Launch 1028 threads, fetch_add(128) into both windows. +*/ + +template +__host__ __device__ void fetch_add_into_window(T* window, uint16_t* atomHistory) +{ + typedef cuda::atomic_ref Atom; + + Atom a(*window); + *atomHistory = a.fetch_add(Inc); +} + +template +__device__ void device_do_test(uint32_t expected) +{ + __shared__ uint16_t atomHistory[1024]; + __shared__ uint32_t atomicStorage; + cuda::atomic_ref bucket(atomicStorage); + + constexpr uint32_t offsetMask = ((4 / sizeof(T)) - 1); + // Access offset is interleaved meaning threads 4, 5, 6, 7 access window 0, 1, 2, 3 and so on. + const uint32_t threadOffset = threadIdx.x & offsetMask; + + if (threadIdx.x == 0) + { + bucket.store(0); + } + __syncthreads(); + + T* window = reinterpret_cast(&atomicStorage) + threadOffset; + fetch_add_into_window(window, atomHistory + threadIdx.x); + + __syncthreads(); + if (threadIdx.x == 0) + { + printf("expected: 0x%X\r\n", expected); + printf("result: 0x%X\r\n", bucket.load()); + assert(bucket.load() == expected); + } +} + +int main(int, char**) +{ + NV_DISPATCH_TARGET(NV_IS_HOST, + (cuda_thread_count = 1020;), + NV_IS_DEVICE, + (device_do_test(~uint32_t(0)); device_do_test(0xFF00FF00);)); + + return 0; +} From c0f52c8afe53c59ddb6e05d57ed8f767cbd9fea3 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Fri, 16 Aug 2024 18:06:32 -0700 Subject: [PATCH 14/29] Fixup documentation mistake. --- .../test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp index d012c2f02b..448215b3a6 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -22,11 +22,11 @@ Test goals: Interleaved 8b/16b access to a 32b window while there is thread contention. for 8b: -Launch 1028 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +Launch 1020 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption caused by interleaved access to different parts of the window. for 16b: -Launch 1028 threads, fetch_add(128) into both windows. +Launch 1020 threads, fetch_add(128) into both windows. */ template From 6d1beecaece3eee3968fb84f3f1f2a9d98c05c22 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 19 Aug 2024 11:15:05 -0700 Subject: [PATCH 15/29] Make atomics enable_if uses match rest of libcudacxx. 
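
Replace open-coded "typename enable_if<...>::type" spellings with the
__enable_if_t alias used in the rest of libcudacxx. Assuming the usual
alias definition (a sketch, not quoted from the headers):

  template <bool _Cond, class _Tp = void>
  using __enable_if_t = typename enable_if<_Cond, _Tp>::type;

the converted helpers behave identically; only the spelling changes.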
--- .../std/__atomic/functions/cuda_ptx_derived.h | 4 +- .../functions/cuda_ptx_generated_helper.h | 6 +- .../std/__atomic/platform/msvc_to_builtins.h | 66 +++++++++---------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index c62fede58e..0b9b3860a6 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -33,10 +33,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template using __cuda_atomic_enable_non_native_arithmetic = - typename enable_if<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>::type; + __enable_if_t<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>; template -using __cuda_atomic_enable_non_native_bitwise = typename enable_if<_Operand::__size <= 16, bool>::type; +using __cuda_atomic_enable_non_native_bitwise = __enable_if_t<_Operand::__size <= 16, bool>; template = 0> static inline _CCCL_DEVICE void diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index 9b81adff18..9a4c2d5bfc 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -170,13 +170,13 @@ template using __atomic_enable_if_native_bitwise = bool; template -using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; +using __atomic_enable_if_native_arithmetic = __enable_if_t<_CCCL_TRAIT(is_scalar, _Type), bool>; template -using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; +using __atomic_enable_if_native_minmax = __enable_if_t<_CCCL_TRAIT(is_integral, _Type), bool>; template -using __atomic_enable_if_not_native_minmax = typename enable_if::type; +using __atomic_enable_if_not_native_minmax = __enable_if_t; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index 8afa9756ef..33d11fd9ad 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -72,9 +72,9 @@ static inline void __atomic_thread_fence(int __memorder) } template -using _enable_if_sized_as = typename enable_if::type; +using __enable_if_sized_as = __enable_if_t; -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -84,7 +84,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -94,7 +94,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -104,7 +104,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 
0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -136,7 +136,7 @@ void __atomic_load(const volatile _Type* __ptr, _Type* __ret, int __memorder) } } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int8*>(__val); @@ -147,7 +147,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store8(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int16*>(__val); @@ -158,7 +158,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store16(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int32*>(__val); @@ -170,7 +170,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store32(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int64*>(__val); @@ -202,7 +202,7 @@ void __atomic_store(volatile _Type* __ptr, _Type* __val, int __memorder) } } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -215,7 +215,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -228,7 +228,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -241,7 +241,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -287,25 +287,25 @@ bool __atomic_compare_exchange( return success; } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = 
_InterlockedExchange64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -341,25 +341,25 @@ void __atomic_exchange(_Type volatile* __ptr, const _Type* __val, _Type* __ret, } } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -404,25 +404,25 @@ _Type __atomic_fetch_sub(_Type volatile* __ptr, _Delta __val, int __memorder) return __atomic_fetch_add(__ptr, 0 - __val, __memorder); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -462,25 +462,25 @@ _Type __atomic_fetch_and(_Type volatile* __ptr, _Delta __val, int __memorder) return *__dest; } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor64((volatile __int64*) 
__ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -520,25 +520,25 @@ _Type __atomic_fetch_xor(_Type volatile* __ptr, _Delta __val, int __memorder) return *__dest; } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); From d5f89285e131093c2089d4b59a15219cf167d715 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 19 Aug 2024 15:32:42 -0700 Subject: [PATCH 16/29] Verify fetch_add sequential load behavior in 8b/16b atomics --- .../cuda/atomics/atomic_ref_small.pass.cpp | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp index 448215b3a6..e1f62d856f 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -22,11 +22,11 @@ Test goals: Interleaved 8b/16b access to a 32b window while there is thread contention. for 8b: -Launch 1020 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +Launch 1024 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption caused by interleaved access to different parts of the window. for 16b: -Launch 1020 threads, fetch_add(128) into both windows. +Launch 1024 threads, fetch_add(1), checking for 0x01FF01FF. 
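+
+Arithmetic sketch for the expected values:
+for 8b: 1024 threads over 4 byte windows is 256 increments per byte; 256 wraps
+to 0x00 modulo 2^8, so the final 32b value is 0.
+for 16b: 1024 threads over 2 halfword windows is 512 increments per halfword,
+so each window ends at 0x0200 and the final 32b value is 0x02000200; the last
+fetch_add in each window returns the pre-increment value 0x01FF.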
*/ template @@ -38,11 +38,17 @@ __host__ __device__ void fetch_add_into_window(T* window, uint16_t* atomHistory) *atomHistory = a.fetch_add(Inc); } -template +template __device__ void device_do_test(uint32_t expected) { - __shared__ uint16_t atomHistory[1024]; + constexpr uint32_t threadCount = 1024; + constexpr uint32_t histogramResultCount = 256 * sizeof(T); + constexpr uint32_t histogramEntriesPerThread = 4 / sizeof(T); + + __shared__ uint16_t atomHistory[threadCount]; + __shared__ uint8_t atomHistogram[histogramResultCount]; __shared__ uint32_t atomicStorage; + cuda::atomic_ref bucket(atomicStorage); constexpr uint32_t offsetMask = ((4 / sizeof(T)) - 1); @@ -51,16 +57,28 @@ __device__ void device_do_test(uint32_t expected) if (threadIdx.x == 0) { + memset(atomHistogram, 0, histogramResultCount); bucket.store(0); } __syncthreads(); T* window = reinterpret_cast(&atomicStorage) + threadOffset; - fetch_add_into_window(window, atomHistory + threadIdx.x); + fetch_add_into_window(window, atomHistory + threadIdx.x); __syncthreads(); if (threadIdx.x == 0) { + // For each thread, add its atomic result into the corresponding bucket + for (int i = 0; i < threadCount; i++) + { + atomHistogram[atomHistory[i]]++; + } + // Check that each bucket has exactly (4 / sizeof(T)) entries + // This checks that atomic fetch operations were sequential. i.e. 4xfetch_add(1) returns [0, 1, 2, 3] + for (int i = 0; i < histogramResultCount; i++) + { + assert(atomHistogram[i] == histogramEntriesPerThread); + } printf("expected: 0x%X\r\n", expected); printf("result: 0x%X\r\n", bucket.load()); assert(bucket.load() == expected); @@ -70,9 +88,9 @@ __device__ void device_do_test(uint32_t expected) int main(int, char**) { NV_DISPATCH_TARGET(NV_IS_HOST, - (cuda_thread_count = 1020;), + (cuda_thread_count = 1024;), NV_IS_DEVICE, - (device_do_test(~uint32_t(0)); device_do_test(0xFF00FF00);)); + (device_do_test(0); device_do_test(0x02000200);)); return 0; } From c9ca506b2747f1d94129fdd94ab876e4794947ce Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 20 Aug 2024 12:59:09 -0700 Subject: [PATCH 17/29] Remove 8b/16b add PTX tests --- .../test/atomic_codegen/atomic_add_16b.cu | 23 ------------------- .../test/atomic_codegen/atomic_add_8b.cu | 23 ------------------- 2 files changed, 46 deletions(-) delete mode 100644 libcudacxx/test/atomic_codegen/atomic_add_16b.cu delete mode 100644 libcudacxx/test/atomic_codegen/atomic_add_8b.cu diff --git a/libcudacxx/test/atomic_codegen/atomic_add_16b.cu b/libcudacxx/test/atomic_codegen/atomic_add_16b.cu deleted file mode 100644 index 95bc80e9a4..0000000000 --- a/libcudacxx/test/atomic_codegen/atomic_add_16b.cu +++ /dev/null @@ -1,23 +0,0 @@ -#include - -__global__ void add_relaxed_device_non_volatile(uint16_t* data, uint16_t* out, uint16_t n) -{ - auto ref = cuda::atomic_ref{*(data)}; - *out = ref.fetch_add(n, cuda::std::memory_order_relaxed); -} - -/* - -; SM8X-LABEL: .target sm_80 -; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; -; SM8X-NEXT: ret; - -*/ diff --git a/libcudacxx/test/atomic_codegen/atomic_add_8b.cu 
b/libcudacxx/test/atomic_codegen/atomic_add_8b.cu deleted file mode 100644 index 8ff6f624e3..0000000000 --- a/libcudacxx/test/atomic_codegen/atomic_add_8b.cu +++ /dev/null @@ -1,23 +0,0 @@ -#include - -__global__ void add_relaxed_device_non_volatile(uint8_t* data, uint8_t* out, uint8_t n) -{ - auto ref = cuda::atomic_ref{*(data)}; - *out = ref.fetch_add(n, cuda::std::memory_order_relaxed); -} - -/* - -; SM8X-LABEL: .target sm_80 -; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; -; SM8X-NEXT: ret; - -*/ From 6f5d0b8b49f95d194b3f3f08ac3ce09489c11d2d Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 20 Aug 2024 18:10:36 -0700 Subject: [PATCH 18/29] Optimize fetch_update CAS loops --- .../std/__atomic/functions/cuda_ptx_derived.h | 107 ++++++++++++++---- 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 0b9b3860a6..06e965d4a2 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -38,12 +38,16 @@ using __cuda_atomic_enable_non_native_arithmetic = template using __cuda_atomic_enable_non_native_bitwise = __enable_if_t<_Operand::__size <= 16, bool>; +template +using __cuda_atomic_enable_native_bitwise = __enable_if_t<_Operand::__size >= 32, bool>; + template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { - uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; uint32_t __value = 0; @@ -55,22 +59,30 @@ template ((__old & __valueMask) >> __offset); - // The expected value no longer matches inside the CAS. - if (__old_value != __cmp) + } + __dst = static_cast<_Type>(__old >> __offset); + return false; +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> +_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + + // 8/16b fetch update is similar to CAS implementation, but compresses the logic for recalculating the operand + // __old = __window[0:32] where [__cmp] resides within some offset. 
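+  // Illustrative example: for a uint8_t at an address with (addr & 3) == 2,
+  // __offset == 16, __valueMask == 0x00FF0000 and __windowMask == 0xFF00FFFF,
+  // so each CAS attempt below replaces only byte 2 of the aligned 32b window.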
+ uint32_t __old; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + // Reemit CAS instructions until we succeed + while (1) + { + // Calculate new desired value from last fetched __old + // Use of the value mask is required due to the possibility of overflow when ops are widened. Possible compiler bug? + const uint32_t __attempt = + ((static_cast(__op(static_cast<_Type>(__old >> __offset))) << __offset) & __valueMask) + | (__old & __windowMask); + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) { - __dst = __old_value; - break; + // CAS was successful + return static_cast<_Type>(__old >> __offset); } } - return false; } -// Lower level fetch_update that bypasses memorder dispatch -template +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) { _Type __expected = 0; - __atomic_load_cuda(__ptr, __expected, __ATOMIC_RELAXED, _Sco{}); + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + _Type __desired = __op(__expected); while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{})) { From b7b944e3db7724167f6ed8ebab70b910b0e63e88 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 20 Aug 2024 18:10:58 -0700 Subject: [PATCH 19/29] Fix name of preset for PTX codegen test --- CMakePresets.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakePresets.json b/CMakePresets.json index 61cb88eca8..357c4da496 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -478,7 +478,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.diff)$" + "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.ptx)$" } } }, From 2381c4297290e936642ed36698dac97e02ecba97 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 21 Aug 2024 09:29:44 -0700 Subject: [PATCH 20/29] Fix signed/unsigned comparison --- .../test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp index e1f62d856f..ab86ca5222 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -69,13 +69,13 @@ __device__ void device_do_test(uint32_t expected) if (threadIdx.x == 0) { // For each thread, add its atomic result into the corresponding bucket - for (int i = 0; i < threadCount; i++) + for (uint32_t i = 0; i < threadCount; i++) { atomHistogram[atomHistory[i]]++; } // Check that each bucket has exactly (4 / sizeof(T)) entries // This checks that atomic fetch operations were sequential. i.e. 
4xfetch_add(1) returns [0, 1, 2, 3] - for (int i = 0; i < histogramResultCount; i++) + for (uint32_t i = 0; i < histogramResultCount; i++) { assert(atomHistogram[i] == histogramEntriesPerThread); } From 07d1077c41f1a2940bd8668687c0bb2ac9c325c0 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 21 Aug 2024 10:30:30 -0700 Subject: [PATCH 21/29] Fix atomics codegen tests not being built --- CMakePresets.json | 5 +++-- libcudacxx/test/CMakeLists.txt | 1 + libcudacxx/test/atomic_codegen/CMakeLists.txt | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 357c4da496..10bdd83539 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -356,7 +356,8 @@ "libcudacxx.test.internal_headers", "libcudacxx.test.public_headers", "libcudacxx.test.public_headers_host_only", - "libcudacxx.test.lit.precompile" + "libcudacxx.test.lit.precompile", + "libcudacxx.test.atomics.ptx" ] }, { @@ -478,7 +479,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.ptx)$" + "name": "^libcudacxx\\.test\\.lit$" } } }, diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index b20a0f3fed..2a7b4d7130 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -43,6 +43,7 @@ endif() find_program(filecheck "FileCheck") +add_custom_target(libcudacxx.test.atomics.ptx) if (filecheck) message("-- ${filecheck} found... building atomic codegen tests") add_subdirectory(atomic_codegen) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 7aee09700a..7cc291a054 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,5 +1,3 @@ -add_custom_target(libcudacxx.test.atomics.ptx) - find_program(filecheck "FileCheck" REQUIRED) find_program(cuobjdump "cuobjdump" REQUIRED) find_program(bash "bash" REQUIRED) From a2d19d13a1d052f8190b29aa318a10d1812679a5 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 21 Aug 2024 14:42:22 -0700 Subject: [PATCH 22/29] Fix CMake target for libcudacxx ptx tests. --- libcudacxx/test/CMakeLists.txt | 8 +------- libcudacxx/test/atomic_codegen/CMakeLists.txt | 11 ++++++++++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index 2a7b4d7130..0b0f18a7c5 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -41,10 +41,4 @@ if (LIBCUDACXX_TEST_WITH_NVRTC) add_subdirectory(utils/nvidia/nvrtc) endif() -find_program(filecheck "FileCheck") - -add_custom_target(libcudacxx.test.atomics.ptx) -if (filecheck) - message("-- ${filecheck} found... building atomic codegen tests") - add_subdirectory(atomic_codegen) -endif() +add_subdirectory(atomic_codegen) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 7cc291a054..ac674b09b5 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,4 +1,13 @@ -find_program(filecheck "FileCheck" REQUIRED) +add_custom_target(libcudacxx.test.atomics.ptx) + +find_program(filecheck "FileCheck") + +if (filecheck) + message("-- ${filecheck} found... 
building atomic codegen tests") +else() + return() +endif() + find_program(cuobjdump "cuobjdump" REQUIRED) find_program(bash "bash" REQUIRED) From 70aa4a3ba07aa7125c3611d645c5bbcc871707b9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 27 Aug 2024 10:15:28 -0700 Subject: [PATCH 23/29] Make dump_and_check executable again --- libcudacxx/test/atomic_codegen/dump_and_check.bash | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 libcudacxx/test/atomic_codegen/dump_and_check.bash diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash old mode 100644 new mode 100755 From e08ed80759fbb70f85dbe389ba055ed209af47c2 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 28 Aug 2024 13:41:01 -0700 Subject: [PATCH 24/29] Work around inconsistent parsing of [[[ in FileCheck versions --- libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu | 6 +++--- libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu | 6 +++--- libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu | 6 +++--- libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu | 4 ++-- libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu | 4 ++-- libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu index d97636d647..d4b3efe3b5 100644 --- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: // ; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; diff --git a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu index e0b6c3d151..b6c5ff4aeb 100644 --- a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void cas_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*cas_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: ld.global.u32 %r[[#LOCALEXP:]], [%rd[[#INPUT]]]; ; SM8X-NEXT: // diff --git a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu 
b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu index 787fcd30bb..58a87c3f14 100644 --- a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void exch_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*exch_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: // ; SM8X-NEXT: atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; diff --git a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu index 6bef7f1c2a..42fea95303 100644 --- a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu @@ -10,8 +10,8 @@ __global__ void load_relaxed_device_non_volatile(int* data, int* out) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*load_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: // ; SM8X-NEXT: ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]]; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index 983c8e9fac..fca00e332a 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -10,8 +10,8 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*store_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: // ; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; ; SM8X-NEXT: // diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index 9d1ffaefa1..70b306b9de 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*sub_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], 
{{.*}}[[FUNCTION]]_param_0{{.*}}
+; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}}
+; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}}
 ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]];
 ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]];
 ; SM8X-NEXT: //

From 1000597d738cf165dc8ff5c8bcf285ed665f8850 Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Wed, 4 Sep 2024 10:12:21 -0700
Subject: [PATCH 25/29] Make min/max match algorithm.min/max.

---
 .../include/cuda/std/__atomic/functions/cuda_ptx_derived.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
index 06e965d4a2..93d4143129 100644
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
@@ -249,7 +249,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __d
   __dst = __cuda_atomic_fetch_update(
     __ptr,
     [__op](_Type __old) {
-      return __old > __op ? __old : __op;
+      return __op < __old ? __old : __op;
     },
     _Order{},
     __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
     _Sco{});

From b7baeef9aab93fb1c1d51eb1aa9de4cc4dc37d4d Mon Sep 17 00:00:00 2001
From: Wesley Maxey
Date: Wed, 11 Sep 2024 13:16:29 -0700
Subject: [PATCH 26/29] Work around NVCC 11.X using different syntax for
 inline PTX

---
 libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu   | 4 +---
 libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu   | 5 ++---
 libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu  | 4 +---
 libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu  | 4 +---
 libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu | 4 +---
 libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu   | 4 +---
 libcudacxx/test/atomic_codegen/dump_and_check.bash          | 2 +-
 7 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu
index d4b3efe3b5..9d1317e021 100644
--- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu
+++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu
@@ -14,9 +14,7 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n)
 ; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}}
 ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}}
 ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]];
-; SM8X-NEXT: //
-; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];
-; SM8X-NEXT: //
+; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}}
 ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]];
 ; SM8X-NEXT: ret;

diff --git a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu
index b6c5ff4aeb..99c73f4c32 100644
--- a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu
+++ b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu
@@ -6,6 +6,7 @@ __global__ void cas_device_relaxed_non_volatile(int* data, int* out, int n)
   ref.compare_exchange_strong(*out, n, cuda::std::memory_order_relaxed);
 }

+// clang-format off
 /*

 ; SM8X-LABEL: .target sm_80
 ; SM8X: .visible .entry [[FUNCTION:_.*cas_device_relaxed_non_volatile.*]](
; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: ld.global.u32 %r[[#LOCALEXP:]], [%rd[[#INPUT]]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu index 58a87c3f14..59c4d52d97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu @@ -14,9 +14,7 @@ __global__ void exch_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu index 42fea95303..804f953294 100644 --- a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu @@ -13,9 +13,7 @@ __global__ void load_relaxed_device_non_volatile(int* data, int* out) ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} ; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index fca00e332a..4dcae5d176 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -12,9 +12,7 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X: .visible .entry [[FUNCTION:_.*store_relaxed_device_non_volatile.*]]( ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} -; SM8X-NEXT: // -; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: ret; */ diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index 70b306b9de..b8a40fee97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -15,9 +15,7 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-DAG: 
ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index a5c62e91bd..c66bb43363 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -6,4 +6,4 @@ input_archive=$1 input_testfile=$2 input_prefix=$3 -cuobjdump --dump-ptx $input_archive | FileCheck --check-prefix $input_prefix $input_testfile +cuobjdump --dump-ptx $input_archive | FileCheck --match-full-lines --check-prefix $input_prefix $input_testfile From c59b3ab1c3eafc933aea0b7fe44af47adc859bae Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 11 Sep 2024 14:27:18 -0700 Subject: [PATCH 27/29] Fix warnings in the codegen tests. --- libcudacxx/test/atomic_codegen/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index ac674b09b5..d881e823f8 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -25,9 +25,12 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) set_target_properties( atomic_codegen_${test_name} - PROPERTIES CUDA_ARCHITECTURES "80" + PROPERTIES + CUDA_ARCHITECTURES "80" ) + target_compile_options(atomic_codegen_${test_name} PRIVATE "-Wno-comment") + ## Important for testing the local headers target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include") add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name}) From 642b4870bfea9f245b287dca2282cc3fd13a9632 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 11 Sep 2024 14:28:10 -0700 Subject: [PATCH 28/29] Use PTX 16b ld/st instead of 32b CAS --- libcudacxx/codegen/generators/ld_st.h | 2 + .../std/__atomic/functions/cuda_ptx_derived.h | 10 +- .../__atomic/functions/cuda_ptx_generated.h | 208 ++++++++++++++++++ 3 files changed, 218 insertions(+), 2 deletions(-) diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h index 43cf80ef5c..d4aec3da54 100644 --- a/libcudacxx/codegen/generators/ld_st.h +++ b/libcudacxx/codegen/generators/ld_st.h @@ -92,6 +92,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_load( {{ asm volatile("ld{8}{4}{6}.{0}{1} %0,[%1];" : "={2}"(__dst) : "l"(__ptr) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { + 16, 32, 64, 128, @@ -248,6 +249,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_store( {{ asm volatile("st{8}{4}{6}.{0}{1} [%0],%1;" :: "l"(__ptr), "{2}"(__val) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { + 16, 32, 64, 128, diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 93d4143129..6ecc4f16b2 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -41,7 +41,13 @@ using __cuda_atomic_enable_non_native_bitwise = 
__enable_if_t<_Operand::__size < template using __cuda_atomic_enable_native_bitwise = __enable_if_t<_Operand::__size >= 32, bool>; -template = 0> +template +using __cuda_atomic_enable_non_native_ld_st = __enable_if_t<_Operand::__size <= 8, bool>; + +template +using __cuda_atomic_enable_native_ld_st = __enable_if_t<_Operand::__size >= 16, bool>; + +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { @@ -164,7 +170,7 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or return __expected; } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 3b164cfc52..01a0f2e3a5 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -116,6 +116,162 @@ static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__ ) } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : 
"l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static 
inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + 
const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_load( const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) @@ -716,6 +872,58 @@ static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &_ ) } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, 
__atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_store( _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) From b1901a281dcfc97af3fbf1d1d416cc218b69cba5 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 11 Sep 2024 14:46:16 -0700 Subject: [PATCH 29/29] Switch 8b ld/st to 16b ld --- .../cuda/std/__atomic/functions/cuda_ptx_derived.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 6ecc4f16b2..5139b7d603 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -51,13 +51,13 @@ template (__value >> __offset); }
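A note for readers following patches 28 and 29 above: the series implements sub-halfword atomics through the aligned 16-bit window that contains the byte. The address is rounded down to 2-byte alignment, the byte's position within the halfword becomes a shift and a mask, and read-modify-write operations retry a 16-bit compare-and-swap until the spliced value lands. The standalone sketch below illustrates that windowing arithmetic using CUDA's built-in atomicCAS on unsigned short (available on compute capability 7.0 and newer). It is a simplified stand-in, not the library's implementation: libcu++ instead emits scoped, memory-order-qualified PTX through the generated __cuda_atomic_* intrinsics shown above, and the function name byte_fetch_add_relaxed here is hypothetical.

#include <cstdint>

// Sketch only: emulate an 8-bit relaxed fetch_add with a CAS loop on the
// aligned 16-bit halfword that contains the byte. Requires the unsigned
// short overload of atomicCAS (compute capability 7.0+).
__device__ unsigned char byte_fetch_add_relaxed(unsigned char* ptr, unsigned char op)
{
  // Round the address down to the containing 16-bit halfword.
  auto* aligned = reinterpret_cast<unsigned short*>(
    reinterpret_cast<std::uintptr_t>(ptr) & ~std::uintptr_t(1));
  // Bit offset of the byte within the halfword: 0 or 8 (NVIDIA GPUs are little-endian).
  const unsigned shift = static_cast<unsigned>(reinterpret_cast<std::uintptr_t>(ptr) & 1) * 8;
  const unsigned short mask = static_cast<unsigned short>(0xFFu << shift);

  unsigned short old = *aligned; // initial guess; the CAS loop corrects it
  while (true)
  {
    const unsigned char old_byte = static_cast<unsigned char>((old & mask) >> shift);
    // Splice the incremented byte back in without disturbing its neighbor.
    const unsigned short desired = static_cast<unsigned short>(
      (old & ~mask) | ((static_cast<unsigned short>(old_byte + op) << shift) & mask));
    const unsigned short prev = atomicCAS(aligned, old, desired);
    if (prev == old)
    {
      return old_byte; // fetch_add returns the value observed before the add
    }
    old = prev; // another thread intervened; retry with the freshly observed halfword
  }
}

For plain loads and stores, patches 28 and 29 avoid the loop entirely: since PTX exposes native 16-bit ld/st, an 8-bit atomic load can be served by a 16-bit load of the aligned halfword followed by the same shift (the __value >> __offset remnant in the truncated final hunk points at exactly this), rather than by a compare-and-swap issued just to observe one byte.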