diff --git a/CMakePresets.json b/CMakePresets.json index bd9374778b..ecc9b22761 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -357,7 +357,8 @@ "libcudacxx.test.internal_headers", "libcudacxx.test.public_headers", "libcudacxx.test.public_headers_host_only", - "libcudacxx.test.lit.precompile" + "libcudacxx.test.lit.precompile", + "libcudacxx.test.atomics.ptx" ] }, { @@ -479,7 +480,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.diff)$" + "name": "^libcudacxx\\.test\\.lit$" } } }, diff --git a/libcudacxx/codegen/generators/compare_and_swap.h b/libcudacxx/codegen/generators/compare_and_swap.h index 5a970735c0..17b11ab69c 100644 --- a/libcudacxx/codegen/generators/compare_and_swap.h +++ b/libcudacxx/codegen/generators/compare_and_swap.h @@ -83,7 +83,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/codegen/generators/exchange.h b/libcudacxx/codegen/generators/exchange.h index dcfe66f147..f89a0075ae 100644 --- a/libcudacxx/codegen/generators/exchange.h +++ b/libcudacxx/codegen/generators/exchange.h @@ -81,7 +81,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 0e525bf296..5139b7d603 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -13,8 +13,6 @@ #include -#include "cuda_ptx_generated.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -33,6 +31,250 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) +template +using __cuda_atomic_enable_non_native_arithmetic = + __enable_if_t<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>; + +template +using __cuda_atomic_enable_non_native_bitwise = __enable_if_t<_Operand::__size <= 16, bool>; + +template +using __cuda_atomic_enable_native_bitwise = __enable_if_t<_Operand::__size >= 32, bool>; + +template +using __cuda_atomic_enable_non_native_ld_st = __enable_if_t<_Operand::__size <= 8, bool>; + +template +using __cuda_atomic_enable_native_ld_st = __enable_if_t<_Operand::__size >= 16, bool>; + +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + constexpr uint64_t __alignmask = (sizeof(uint16_t) - 1); + uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint16_t((intptr_t) __ptr & __alignmask) * 8; + + uint16_t __value = 0; + __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{}); + + __dst = static_cast<_Type>(__value >> __offset); +} + +template = 0> +static inline _CCCL_DEVICE bool +__cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = 
~__valueMask; + const uint32_t __cmpOffset = __cmp << __offset; + const uint32_t __opOffset = __op << __offset; + + // Algorithm for 8b CAS with 32b intrinsics + // __old = __window[0:32] where [__cmp] resides within some offset. + uint32_t __old; + // Start by loading __old with the current value, this optimizes for early return when __cmp is wrong + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + // Reemit CAS instructions until we succeed or the old value is a mismatch + while (__cmpOffset == (__old & __valueMask)) + { + // Combine the desired value and most recently fetched expected masked portion of the window + const uint32_t __attempt = (__old & __windowMask) | __opOffset; + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return true; + } + } + __dst = static_cast<_Type>(__old >> __offset); + return false; +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> +_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + + // 8/16b fetch update is similar to CAS implementation, but compresses the logic for recalculating the operand + // __old = __window[0:32] where [__cmp] resides within some offset. + uint32_t __old; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + // Reemit CAS instructions until we succeed + while (1) + { + // Calculate new desired value from last fetched __old + // Use of the value mask is required due to the possibility of overflow when ops are widened. Possible compiler bug? + const uint32_t __attempt = + ((static_cast(__op(static_cast<_Type>(__old >> __offset))) << __offset) & __valueMask) + | (__old & __windowMask); + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return static_cast<_Type>(__old >> __offset); + } + } +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. 
+template = 0> +_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + _Type __expected = 0; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + _Type __desired = __op(__expected); + while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{})) + { + __desired = __op(__expected); + } + return __expected; +} + +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + // Store requires cas on 8/16b types + __cuda_atomic_fetch_update( + __ptr, + [__val](_Type __old) { + return __val; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old + __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old & __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old ^ __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old | __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old < __op ? __old : __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __op < __old ? 
__old : __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + template _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { @@ -98,7 +340,7 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __ } template = 0> -_CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -109,7 +351,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder _Sco{}); } template = 0> -_CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -121,7 +363,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int _ } template = 0> -_CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -132,7 +374,7 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorde _Sco{}); } template = 0> -_CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -143,262 +385,6 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int _Sco{}); } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// void volatile* __ptr, -// _Tp* __expected, -// const _Tp __desired, -// bool __weak, -// int __success_memorder, -// int __failure_memorder, -// _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__old, __expected, sizeof(__proxy_t)); -// memcpy(&__new, &__desired, sizeof(__proxy_t)); -// bool __result = -// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); -// memcpy(__expected, &__old, sizeof(__proxy_t)); -// return __result; -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__old, __expected, sizeof(__proxy_t)); -// memcpy(&__new, &__desired, sizeof(__proxy_t)); -// bool __result = -// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); -// memcpy(__expected, &__old, sizeof(__proxy_t)); -// return __result; -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* 
__val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } - -// template = 0> -// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); -// auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; -// auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset; - -// uint32_t __old = *__expected << __offset; -// uint32_t __old_value; -// while (1) -// { -// __old_value = (__old & __mask) >> __offset; -// if (__old_value != *__expected) -// { -// break; -// } -// uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); -// if (__atomic_compare_exchange_cuda( -// __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{})) -// { -// return true; -// } -// } -// *__expected = __old_value; -// return false; -// } - -// template = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) -// ; -// *__ret = __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected + __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected + __val; -// } -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? 
__expected : __val; -// } - -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected - __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected - __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected & __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected & __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected ^ __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected ^ __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected | __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected | __val; -// } -// return __expected; -// } - -// template -// _CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( -// _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// return __atomic_compare_exchange_cuda( -// __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); -// } - -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); -// return __ret; -// } -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, 
__memorder, _Sco{}); -// return __ret; -// } - _CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int) { asm volatile("" ::: "memory"); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index e72144b68c..01a0f2e3a5 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -1220,86 +1220,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch return __res; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acquire.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acquire.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acquire.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acquire.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.relaxed.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.relaxed.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.relaxed.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.relaxed.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type 
__op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.release.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.release.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.release.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.release.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acq_rel.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acq_rel.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acq_rel.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acq_rel.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, 
__atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } template static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) @@ -1763,86 +1683,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acquire.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acquire.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acquire.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acquire.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.relaxed.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.relaxed.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.relaxed.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.relaxed.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.release.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } 
-template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.release.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.release.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.release.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acq_rel.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acq_rel.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acq_rel.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acq_rel.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_exchange( _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h 
b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index 861e9f7b08..9a4c2d5bfc 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -69,8 +69,15 @@ enum class __atomic_cuda_operand template <__atomic_cuda_operand _Op, size_t _Size> struct __atomic_cuda_operand_tag -{}; +{ + static constexpr auto __op = _Op; + static constexpr auto __size = _Size; +}; +using __atomic_cuda_operand_f8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 8>; +using __atomic_cuda_operand_s8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 8>; +using __atomic_cuda_operand_u8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 8>; +using __atomic_cuda_operand_b8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 8>; using __atomic_cuda_operand_f16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 16>; using __atomic_cuda_operand_s16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 16>; using __atomic_cuda_operand_u16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 16>; @@ -103,13 +110,15 @@ struct __atomic_longlong2 template using __atomic_cuda_deduce_bitwise = - _If, - _If, - _If, - __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>; + _If, + _If, + _If, + _If, + __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>>; template using __atomic_cuda_deduce_arithmetic = @@ -118,37 +127,56 @@ using __atomic_cuda_deduce_arithmetic = __atomic_cuda_operand_deduction, __atomic_cuda_operand_deduction>, _If<_CCCL_TRAIT(is_signed, _Type), - _If, - __atomic_cuda_operand_deduction>, // There is no atom.add.s64 - _If, - __atomic_cuda_operand_deduction>>>; + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>, // There is no + // atom.add.s64 + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>>>; template using __atomic_cuda_deduce_minmax = - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_floating_point, _Type), _If, - __atomic_cuda_operand_deduction>>; + __atomic_cuda_operand_deduction, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>, // atom.min|max.s64 + // supported + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>>>; template using __atomic_enable_if_native_bitwise = bool; template -using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; - -template -using __atomic_enable_if_not_native_arithmetic = typename enable_if::type; +using __atomic_enable_if_native_arithmetic = __enable_if_t<_CCCL_TRAIT(is_scalar, _Type), bool>; template -using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; +using __atomic_enable_if_native_minmax = __enable_if_t<_CCCL_TRAIT(is_integral, _Type), bool>; template -using __atomic_enable_if_not_native_minmax = typename enable_if::type; +using __atomic_enable_if_not_native_minmax = __enable_if_t; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index 8afa9756ef..33d11fd9ad 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -72,9 +72,9 @@ static inline void __atomic_thread_fence(int __memorder) 
} template -using _enable_if_sized_as = typename enable_if::type; +using __enable_if_sized_as = __enable_if_t; -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -84,7 +84,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -94,7 +94,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -104,7 +104,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -136,7 +136,7 @@ void __atomic_load(const volatile _Type* __ptr, _Type* __ret, int __memorder) } } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int8*>(__val); @@ -147,7 +147,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store8(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int16*>(__val); @@ -158,7 +158,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store16(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int32*>(__val); @@ -170,7 +170,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store32(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int64*>(__val); @@ -202,7 +202,7 @@ void __atomic_store(volatile _Type* __ptr, _Type* __val, int __memorder) } } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -215,7 +215,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -228,7 +228,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -241,7 +241,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -287,25 +287,25 @@ bool 
__atomic_compare_exchange( return success; } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -341,25 +341,25 @@ void __atomic_exchange(_Type volatile* __ptr, const _Type* __val, _Type* __ret, } } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -404,25 +404,25 @@ _Type __atomic_fetch_sub(_Type volatile* __ptr, _Delta __val, int __memorder) return __atomic_fetch_add(__ptr, 0 - __val, __memorder); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -462,25 +462,25 @@ _Type __atomic_fetch_and(_Type volatile* __ptr, _Delta __val, int __memorder) return 
*__dest; } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -520,25 +520,25 @@ _Type __atomic_fetch_xor(_Type volatile* __ptr, _Delta __val, int __memorder) return *__dest; } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); diff --git a/libcudacxx/include/cuda/std/__atomic/types.h b/libcudacxx/include/cuda/std/__atomic/types.h index 4b58ba4901..73e5ef0044 100644 --- a/libcudacxx/include/cuda/std/__atomic/types.h +++ b/libcudacxx/include/cuda/std/__atomic/types.h @@ -32,10 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct __atomic_traits { - static constexpr bool __atomic_requires_lock = !__atomic_is_always_lock_free<_Tp>::__value; - static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4; - static constexpr bool __atomic_supports_reference = - __atomic_is_always_lock_free<_Tp>::__value && (sizeof(_Tp) >= 4 && sizeof(_Tp) <= 8); + static constexpr bool __atomic_requires_lock = !__atomic_is_always_lock_free<_Tp>::__value; + static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4; + static constexpr bool __atomic_supports_reference = __atomic_is_always_lock_free<_Tp>::__value && sizeof(_Tp) <= 8; }; template diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index b20a0f3fed..0b0f18a7c5 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -41,9 +41,4 @@ if (LIBCUDACXX_TEST_WITH_NVRTC) add_subdirectory(utils/nvidia/nvrtc) endif() -find_program(filecheck "FileCheck") - -if (filecheck) - message("-- ${filecheck} found... 
building atomic codegen tests") - add_subdirectory(atomic_codegen) -endif() +add_subdirectory(atomic_codegen) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 095fa41cf7..d881e823f8 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,6 +1,13 @@ add_custom_target(libcudacxx.test.atomics.ptx) -find_program(filecheck "FileCheck" REQUIRED) +find_program(filecheck "FileCheck") + +if (filecheck) + message("-- ${filecheck} found... building atomic codegen tests") +else() + return() +endif() + find_program(cuobjdump "cuobjdump" REQUIRED) find_program(bash "bash" REQUIRED) @@ -16,6 +23,14 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) STATIC ${test_path} ) + set_target_properties( + atomic_codegen_${test_name} + PROPERTIES + CUDA_ARCHITECTURES "80" + ) + + target_compile_options(atomic_codegen_${test_name} PRIVATE "-Wno-comment") + ## Important for testing the local headers target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include") add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name}) diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu index d97636d647..9d1317e021 100644 --- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu @@ -10,13 +10,11 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu index e0b6c3d151..99c73f4c32 100644 --- a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu @@ -6,18 +6,17 @@ __global__ void cas_device_relaxed_non_volatile(int* data, int* out, int n) ref.compare_exchange_strong(*out, n, cuda::std::memory_order_relaxed); } +// clang-format off /* ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*cas_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], 
%rd[[#EXPECTED]]; ; SM8X-NEXT: ld.global.u32 %r[[#LOCALEXP:]], [%rd[[#INPUT]]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu index 787fcd30bb..59c4d52d97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu @@ -10,13 +10,11 @@ __global__ void exch_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*exch_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu index 6bef7f1c2a..804f953294 100644 --- a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu @@ -10,12 +10,10 @@ __global__ void load_relaxed_device_non_volatile(int* data, int* out) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*load_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index 983c8e9fac..4dcae5d176 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -10,11 +10,9 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*store_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_1]; -; SM8X-NEXT: // -; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; 
SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-NEXT: {{/*[[:space:]] *}}st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: ret; */ diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index 9d1ffaefa1..b8a40fee97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -10,14 +10,12 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*sub_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash old mode 100644 new mode 100755 index a5c62e91bd..c66bb43363 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -6,4 +6,4 @@ input_archive=$1 input_testfile=$2 input_prefix=$3 -cuobjdump --dump-ptx $input_archive | FileCheck --check-prefix $input_prefix $input_testfile +cuobjdump --dump-ptx $input_archive | FileCheck --match-full-lines --check-prefix $input_prefix $input_testfile diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp new file mode 100644 index 0000000000..ab86ca5222 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 +// UNSUPPORTED: windows && pre-sm-70 + +// + +#include +#include +#include + +#include "test_macros.h" + +/* +Test goals: +Interleaved 8b/16b access to a 32b window while there is thread contention. + +for 8b: +Launch 1024 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +caused by interleaved access to different parts of the window. + +for 16b: +Launch 1024 threads, fetch_add(1), checking for 0x01FF01FF. 
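+(The value asserted in main() for the 16-bit case is 0x02000200: the 1024 threads are split across the two 16-bit halves of the 32-bit window, so each half receives 512 = 0x0200 increments; 0x01FF is the largest individual value returned by fetch_add.)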
+*/ + +template +__host__ __device__ void fetch_add_into_window(T* window, uint16_t* atomHistory) +{ + typedef cuda::atomic_ref Atom; + + Atom a(*window); + *atomHistory = a.fetch_add(Inc); +} + +template +__device__ void device_do_test(uint32_t expected) +{ + constexpr uint32_t threadCount = 1024; + constexpr uint32_t histogramResultCount = 256 * sizeof(T); + constexpr uint32_t histogramEntriesPerThread = 4 / sizeof(T); + + __shared__ uint16_t atomHistory[threadCount]; + __shared__ uint8_t atomHistogram[histogramResultCount]; + __shared__ uint32_t atomicStorage; + + cuda::atomic_ref bucket(atomicStorage); + + constexpr uint32_t offsetMask = ((4 / sizeof(T)) - 1); + // Access offset is interleaved meaning threads 4, 5, 6, 7 access window 0, 1, 2, 3 and so on. + const uint32_t threadOffset = threadIdx.x & offsetMask; + + if (threadIdx.x == 0) + { + memset(atomHistogram, 0, histogramResultCount); + bucket.store(0); + } + __syncthreads(); + + T* window = reinterpret_cast(&atomicStorage) + threadOffset; + fetch_add_into_window(window, atomHistory + threadIdx.x); + + __syncthreads(); + if (threadIdx.x == 0) + { + // For each thread, add its atomic result into the corresponding bucket + for (uint32_t i = 0; i < threadCount; i++) + { + atomHistogram[atomHistory[i]]++; + } + // Check that each bucket has exactly (4 / sizeof(T)) entries + // This checks that atomic fetch operations were sequential. i.e. 4xfetch_add(1) returns [0, 1, 2, 3] + for (uint32_t i = 0; i < histogramResultCount; i++) + { + assert(atomHistogram[i] == histogramEntriesPerThread); + } + printf("expected: 0x%X\r\n", expected); + printf("result: 0x%X\r\n", bucket.load()); + assert(bucket.load() == expected); + } +} + +int main(int, char**) +{ + NV_DISPATCH_TARGET(NV_IS_HOST, + (cuda_thread_count = 1024;), + NV_IS_DEVICE, + (device_do_test(0); device_do_test(0x02000200);)); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp index ddef76ec28..4cae377b27 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp @@ -175,19 +175,14 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp index 9da187cee7..366b93b82f 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp @@ -167,19 +167,14 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp 
b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp index 56153f3664..5ef493b6dd 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>(); diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp index b237c862a5..57c4a10f53 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>();
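
Illustrative usage sketch, not part of the patch above: with the 8/16-bit paths routed through __cuda_atomic_fetch_update, device code can apply cuda::atomic_ref to packed sub-word integers. The kernel below is a hypothetical example (the name bump_bytes and the launch shape are invented for illustration); it assumes an sm_70-or-newer target and the <cuda/atomic> header, matching the NV_PROVIDES_SM_70 dispatch used in cuda_ptx_derived.h.

#include <cuda/atomic>
#include <cstdint>

__global__ void bump_bytes(uint8_t* counters)
{
  // Four 8-bit counters share one aligned 32-bit word; each fetch_add is
  // emulated with a 32-bit CAS loop on the containing window, as implemented
  // by the non-native __cuda_atomic_fetch_update overload in this patch.
  cuda::atomic_ref<uint8_t, cuda::thread_scope_device> c(counters[threadIdx.x & 3]);
  c.fetch_add(1, cuda::std::memory_order_relaxed);
}

int main()
{
  uint8_t* counters = nullptr;
  cudaMalloc(&counters, 4);
  cudaMemset(counters, 0, 4);
  bump_bytes<<<1, 256>>>(counters); // each of the 4 byte counters ends at 64
  cudaDeviceSynchronize();
  cudaFree(counters);
  return 0;
}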