From 9941a44887e7cf36491e82aec7c9d74eb3e81671 Mon Sep 17 00:00:00 2001
From: Wesley Maxey
Date: Wed, 24 Jul 2024 17:04:22 -0700
Subject: [PATCH 01/29] Support fetch_add and CAS on 8/16b

---
 .../std/__atomic/functions/cuda_ptx_derived.h | 118 +++++++++++++-----
 .../functions/cuda_ptx_generated_helper.h     |  46 ++++---
 libcudacxx/include/cuda/std/__atomic/types.h  |   7 +-
 libcudacxx/test/atomic_codegen/CMakeLists.txt |   5 +
 .../test/atomic_codegen/atomic_add_8b.cu      |  23 ++++
 5 files changed, 152 insertions(+), 47 deletions(-)
 create mode 100644 libcudacxx/test/atomic_codegen/atomic_add_8b.cu

diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
index 0e525bf296..25388a8273 100644
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
@@ -13,6 +13,9 @@

 #include

+#include
+
+#include "cuda/std/__atomic/functions/cuda_ptx_generated_helper.h"
 #include "cuda_ptx_generated.h"

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
@@ -33,6 +36,92 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD

 #if defined(_CCCL_CUDA_COMPILER)

+template <class _Operand>
+using __atomic_cuda_enable_non_native_load = typename enable_if<_Operand::__size <= 8, bool>::type;
+
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_load<_Operand> = 0>
+static inline _CCCL_DEVICE bool
+__cuda_atomic_load(_Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable)
+{
+  uint16_t* __aligned     = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1));
+  const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8;
+  const uint16_t __mask   = ((1 << 8) - 1) << __offset;
+
+  uint16_t __value = 0;
+
+  __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{});
+  __dst = static_cast<_Type>(__value >> __offset);
+}
+
+template <class _Type, class _Order, class _Sco>
+static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, __atomic_cuda_operand_b8, _Sco)
+{
+  uint16_t* __aligned     = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1));
+  const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8;
+  const uint16_t __mask   = ((1 << 8) - 1) << __offset;
+
+  // Algorithm for 8b CAS with 16b intrinsics
+  // __old = __window[0:16] where [__cmp] resides in either of the two 8b offsets
+  // First CAS attempt 'guesses' that the masked portion of the window is 0x00.
+  uint16_t __old       = (uint16_t(__cmp) << __offset);
+  uint16_t __old_value = 0;
+
+  bool __success = false;
+
+  // Re-emit CAS instructions until either of two conditions is met
+  while (1)
+  {
+    // Combine the desired value and most recently fetched expected masked portion of the window
+    uint16_t __attempt = (__old & ~__mask) | (uint16_t(__op) << __offset);
+
+    if (__cuda_atomic_compare_exchange(
+          __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}))
+    {
+      // CAS was successful
+      return true;
+    }
+    __old_value = (__old & __mask) >> __offset;
+    // The expected value no longer matches inside the CAS.
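+    // (Worked example with hypothetical values: __cmp = 0x2A and __op = 0x7F
+    //  on the upper byte give __offset = 8, __mask = 0xFF00, and an initial
+    //  guess of __old = 0x2A00. A fetched window of 0x2A19 loops again with
+    //  __attempt = 0x7F19, since its masked byte still equals __cmp; a fetched
+    //  window of 0x1319 exits below with __dst = 0x13.)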
+    if (__old_value != __cmp)
+    {
+      __dst = __old_value;
+      break;
+    }
+  }
+  return false;
+}
+
+// Lower level fetch_update that bypasses memorder dispatch
+template <class _Type, class _Fn, class _Order, class _Operand, class _Sco>
+_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco)
+{
+  _Type __expected = 0;
+  __cuda_atomic_load(__ptr, __expected, __atomic_cuda_relaxed{}, _Operand{}, _Sco{}, __atomic_cuda_mmio_disable{});
+  _Type __desired = __op(__expected);
+  while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{}))
+  {
+    __desired = __op(__expected);
+  }
+  return __expected;
+}
+
+template <class _Operand>
+using __atomic_cuda_enable_non_native_add = typename enable_if<_Operand::__size <= 16, bool>::type;
+
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
+{
+  __dst = __cuda_atomic_fetch_update(
+    __ptr,
+    [__op](_Type __old) {
+      return __old + __op;
+    },
+    _Order{},
+    __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
+    _Sco{});
+}
+
 template
 _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco)
 {
@@ -207,35 +296,6 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int
 //   memcpy(__ret, &__old, sizeof(__proxy_t));
 // }

-// template = 0>
-// _CCCL_DEVICE bool __atomic_compare_exchange_cuda(
-//   _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder,
-//   _Sco)
-// {
-//   auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1));
-//   auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8;
-//   auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset;

-//   uint32_t __old = *__expected << __offset;
-//   uint32_t __old_value;
-//   while (1)
-//   {
-//     __old_value = (__old & __mask) >> __offset;
-//     if (__old_value != *__expected)
-//     {
-//       break;
-//     }
-//     uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset);
-//     if (__atomic_compare_exchange_cuda(
-//           __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{}))
-//     {
-//       return true;
-//     }
-//   }
-//   *__expected = __old_value;
-//   return false;
-// }
-
 // template = 0>
 // _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco)
 // {
diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h
index 861e9f7b08..cc3f3f4659 100644
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h
@@ -69,8 +69,15 @@ enum class __atomic_cuda_operand

 template <__atomic_cuda_operand _Op, size_t _Size>
 struct __atomic_cuda_operand_tag
-{};
+{
+  static constexpr auto __op   = _Op;
+  static constexpr auto __size = _Size;
+};

+using __atomic_cuda_operand_f8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 8>;
+using __atomic_cuda_operand_s8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 8>;
+using __atomic_cuda_operand_u8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 8>;
+using __atomic_cuda_operand_b8  = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 8>;
 using __atomic_cuda_operand_f16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 16>;
 using __atomic_cuda_operand_s16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 16>;
 using __atomic_cuda_operand_u16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 16>;
@@ -103,13 +110,15 @@ struct __atomic_longlong2

 template <class _Type>
 using __atomic_cuda_deduce_bitwise =
-  _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<uint16_t, __atomic_cuda_operand_b16>,
-      _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_b32>,
-          _If<sizeof(_Type) == 8, __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_b64>,
-              __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>;
+  _If<sizeof(_Type) == 1, __atomic_cuda_operand_deduction<uint8_t, __atomic_cuda_operand_b8>,
+      _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<uint16_t, __atomic_cuda_operand_b16>,
+          _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_b32>,
+              _If<sizeof(_Type) == 8, __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_b64>,
+                  __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>>;

 template <class _Type>
 using __atomic_cuda_deduce_arithmetic =
@@ -118,12 +127,21 @@ using __atomic_cuda_deduce_arithmetic =
       __atomic_cuda_operand_deduction<float, __atomic_cuda_operand_f32>,
       __atomic_cuda_operand_deduction<double, __atomic_cuda_operand_f64>>,
   _If<_CCCL_TRAIT(is_signed, _Type),
-      _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<int32_t, __atomic_cuda_operand_s32>,
-          __atomic_cuda_operand_deduction<int64_t, __atomic_cuda_operand_u64>>, // There is no atom.add.s64
-      _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_u32>,
-          __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_u64>>>>;
+      _If<sizeof(_Type) == 1, __atomic_cuda_operand_deduction<int8_t, __atomic_cuda_operand_s8>,
+          _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<int16_t, __atomic_cuda_operand_s16>,
+              _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<int32_t, __atomic_cuda_operand_s32>,
+                  __atomic_cuda_operand_deduction<int64_t, __atomic_cuda_operand_u64>>>>, // There is no
+                                                                                          // atom.add.s64
+      _If<sizeof(_Type) == 1, __atomic_cuda_operand_deduction<uint8_t, __atomic_cuda_operand_u8>,
+          _If<sizeof(_Type) == 2, __atomic_cuda_operand_deduction<uint16_t, __atomic_cuda_operand_u16>,
+              _If<sizeof(_Type) == 4, __atomic_cuda_operand_deduction<uint32_t, __atomic_cuda_operand_u32>,
+                  __atomic_cuda_operand_deduction<uint64_t, __atomic_cuda_operand_u64>>>>>>;

 template <class _Type>
 using __atomic_cuda_deduce_minmax =
diff --git a/libcudacxx/include/cuda/std/__atomic/types.h b/libcudacxx/include/cuda/std/__atomic/types.h
index 4b58ba4901..73e5ef0044 100644
--- a/libcudacxx/include/cuda/std/__atomic/types.h
+++ b/libcudacxx/include/cuda/std/__atomic/types.h
@@ -32,10 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <typename _Tp>
 struct __atomic_traits
 {
-  static constexpr bool __atomic_requires_lock  = !__atomic_is_always_lock_free<_Tp>::__value;
-  static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4;
-  static constexpr bool __atomic_supports_reference =
-    __atomic_is_always_lock_free<_Tp>::__value && (sizeof(_Tp) >= 4 && sizeof(_Tp) <= 8);
+  static constexpr bool __atomic_requires_lock      = !__atomic_is_always_lock_free<_Tp>::__value;
+  static constexpr bool __atomic_requires_small     = sizeof(_Tp) < 4;
+  static constexpr bool __atomic_supports_reference = __atomic_is_always_lock_free<_Tp>::__value && sizeof(_Tp) <= 8;
 };

 template
diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt
index 095fa41cf7..7aee09700a 100644
--- a/libcudacxx/test/atomic_codegen/CMakeLists.txt
+++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt
@@ -16,6 +16,11 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests)
     STATIC ${test_path}
   )

+  set_target_properties(
+    atomic_codegen_${test_name}
+    PROPERTIES CUDA_ARCHITECTURES "80"
+  )
+
   ## Important for testing the local headers
   target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include")
   add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name})
diff --git a/libcudacxx/test/atomic_codegen/atomic_add_8b.cu b/libcudacxx/test/atomic_codegen/atomic_add_8b.cu
new file mode 100644
index 0000000000..8ff6f624e3
--- /dev/null
+++ b/libcudacxx/test/atomic_codegen/atomic_add_8b.cu
@@ -0,0 +1,23 @@
+#include <cuda/atomic>
+
+__global__ void add_relaxed_device_non_volatile(uint8_t* data, uint8_t* out, uint8_t n)
+{
+  auto ref = cuda::atomic_ref<uint8_t, cuda::thread_scope_device>{*(data)};
+  *out = ref.fetch_add(n, cuda::std::memory_order_relaxed);
+}
+
+/*
+
+; SM8X-LABEL: .target sm_80
+; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]](
+; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0];
+; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1];
+; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2];
+; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]];
+; SM8X-NEXT: //
+; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];
+; SM8X-NEXT: //
+; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]];
+; SM8X-NEXT: ret;
+
+*/
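
(For context, beyond the FileCheck-based codegen test above: a minimal smoke
test of the new 8b path. This is an illustrative sketch, not part of the
patch; it assumes a toolchain with this series applied, so that
cuda::atomic_ref over an 8-bit type compiles, and a device with managed
memory support.)

    #include <cuda/atomic>
    #include <cstdint>
    #include <cstdio>

    __global__ void bump(uint8_t* p)
    {
      cuda::atomic_ref<uint8_t, cuda::thread_scope_device> ref{*p};
      ref.fetch_add(1, cuda::std::memory_order_relaxed);
    }

    int main()
    {
      uint8_t* p;
      cudaMallocManaged(&p, sizeof(uint8_t));
      *p = 0;
      bump<<<1, 100>>>(p);
      cudaDeviceSynchronize();
      std::printf("%d\n", int(*p)); // expect 100: each of the 100 threads adds 1
      cudaFree(p);
      return 0;
    }

From 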
bfd97ee727ba42ea327cc0c077d7f582249d7ed9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 25 Jul 2024 12:44:08 -0700 Subject: [PATCH 02/29] Add 16b test --- .../test/atomic_codegen/atomic_add_16b.cu | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 libcudacxx/test/atomic_codegen/atomic_add_16b.cu diff --git a/libcudacxx/test/atomic_codegen/atomic_add_16b.cu b/libcudacxx/test/atomic_codegen/atomic_add_16b.cu new file mode 100644 index 0000000000..95bc80e9a4 --- /dev/null +++ b/libcudacxx/test/atomic_codegen/atomic_add_16b.cu @@ -0,0 +1,23 @@ +#include + +__global__ void add_relaxed_device_non_volatile(uint16_t* data, uint16_t* out, uint16_t n) +{ + auto ref = cuda::atomic_ref{*(data)}; + *out = ref.fetch_add(n, cuda::std::memory_order_relaxed); +} + +/* + +; SM8X-LABEL: .target sm_80 +; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; +; SM8X-NEXT: // +; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; +; SM8X-NEXT: // +; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; +; SM8X-NEXT: ret; + +*/ From 24bb639393a682350ffca445bb674a0f09af7e31 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 2 Aug 2024 15:53:45 -0700 Subject: [PATCH 03/29] Fix issues found when enabling 8/16b in a heterogeneous test, PTX seems to be invalid though --- .../std/__atomic/functions/cuda_ptx_derived.h | 77 +++++++++++++++++-- .../atomic/reference_cuda.pass.cpp | 6 +- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 25388a8273..d36f5a8777 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -40,12 +40,11 @@ template using __atomic_cuda_enable_non_native_load = typename enable_if<_Operand::__size <= 8, bool>::type; template = 0> -static inline _CCCL_DEVICE bool -__cuda_atomic_load(_Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +static inline _CCCL_DEVICE void +__cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1)); const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8; - const uint16_t __mask = ((1 << 8) - 1) << __offset; uint16_t __value = 0; @@ -67,8 +66,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( uint16_t __old = (uint16_t(__op) << __offset); uint16_t __old_value = 0; - bool __success = false; - // Reemit CAS instructions until either of two conditions are met while (1) { @@ -109,6 +106,9 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or template using __atomic_cuda_enable_non_native_add = typename enable_if<_Operand::__size <= 16, bool>::type; +template +using __atomic_cuda_enable_non_native_exchange = typename enable_if<_Operand::__size <= 8, bool>::type; + template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { @@ -122,6 +122,73 @@ static inline _CCCL_DEVICE void 
__cuda_atomic_fetch_add(_Type* __ptr, _Type& __d _Sco{}); } +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old & __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old ^ __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old | __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + // Store requires cas on 8b types + __cuda_atomic_fetch_update( + __ptr, + [__val](_Type __old) { + return __val; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + template _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp index ddef76ec28..fbe613fe44 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp @@ -175,19 +175,15 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL + // todo validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); From f93e690c7ba7904f280599aea92702b9f9546918 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 7 Aug 2024 17:07:04 -0700 Subject: [PATCH 04/29] Remove 16b cas and use only 32b cas. 
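
All 8b and 16b atomics are now funneled through the 32b compare-and-swap
path, so the 16b cas/exch/ld/st emitters and their generated inline-asm
variants can be dropped from the codegen lists. A sketch of the shape this
emulation takes (illustrative only -- names and layout here are simplified,
not the exact header text):

    // Hypothetical standalone equivalent of the emulated 8b fetch_add:
    __device__ unsigned char fetch_add_8b_via_32b_cas(unsigned char* p, unsigned char v)
    {
      unsigned int* w    = (unsigned int*) ((intptr_t) p & ~(sizeof(unsigned int) - 1));
      unsigned int shift = ((intptr_t) p & (sizeof(unsigned int) - 1)) * 8;
      unsigned int mask  = 0xFFu << shift;
      unsigned int old   = *w;
      unsigned int assumed;
      do
      {
        assumed            = old;
        unsigned int byte  = ((assumed & mask) >> shift) + v;      // wraps mod 256 below
        unsigned int fresh = (assumed & ~mask) | ((byte & 0xFFu) << shift);
        old                = atomicCAS(w, assumed, fresh);         // 32b CAS on the aligned window
      } while (assumed != old);
      return (unsigned char) ((old & mask) >> shift);
    }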
--- .../codegen/generators/compare_and_swap.h | 1 - libcudacxx/codegen/generators/exchange.h | 1 - libcudacxx/codegen/generators/ld_st.h | 2 - .../std/__atomic/functions/cuda_ptx_derived.h | 319 +++------------ .../__atomic/functions/cuda_ptx_generated.h | 368 ------------------ 5 files changed, 57 insertions(+), 634 deletions(-) diff --git a/libcudacxx/codegen/generators/compare_and_swap.h b/libcudacxx/codegen/generators/compare_and_swap.h index 5a970735c0..17b11ab69c 100644 --- a/libcudacxx/codegen/generators/compare_and_swap.h +++ b/libcudacxx/codegen/generators/compare_and_swap.h @@ -83,7 +83,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/codegen/generators/exchange.h b/libcudacxx/codegen/generators/exchange.h index dcfe66f147..f89a0075ae 100644 --- a/libcudacxx/codegen/generators/exchange.h +++ b/libcudacxx/codegen/generators/exchange.h @@ -81,7 +81,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h index d4aec3da54..43cf80ef5c 100644 --- a/libcudacxx/codegen/generators/ld_st.h +++ b/libcudacxx/codegen/generators/ld_st.h @@ -92,7 +92,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_load( {{ asm volatile("ld{8}{4}{6}.{0}{1} %0,[%1];" : "={2}"(__dst) : "l"(__ptr) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, @@ -249,7 +248,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_store( {{ asm volatile("st{8}{4}{6}.{0}{1} [%0],%1;" :: "l"(__ptr), "{2}"(__val) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index d36f5a8777..710362864c 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -37,43 +37,46 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) template -using __atomic_cuda_enable_non_native_load = typename enable_if<_Operand::__size <= 8, bool>::type; +using __atomic_cuda_enable_non_native_size = typename enable_if<_Operand::__size <= 16, bool>::type; -template = 0> +template +using __atomic_cuda_enable_native_size = typename enable_if<_Operand::__size >= 32, bool>::type; + +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { - uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1)); - const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); + const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; - uint16_t __value = 0; + uint32_t __value = 0; - __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{}); + __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{}); __dst = static_cast<_Type>(__value >> __offset); } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, __atomic_cuda_operand_b8, _Sco) +template = 0> +static inline _CCCL_DEVICE bool 
+__cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco)
 {
-  uint16_t* __aligned     = (uint16_t*) ((intptr_t) __ptr & ~(sizeof(uint16_t) - 1));
-  const uint16_t __offset = uint16_t((intptr_t) __ptr & (sizeof(uint16_t) - 1)) * 8;
-  const uint16_t __mask   = ((1 << 8) - 1) << __offset;
+  uint32_t* __aligned     = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1));
+  const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8;
+  const uint32_t __mask   = ((1 << (sizeof(_Type) * 8)) - 1) << __offset;

-  // Algorithm for 8b CAS with 16b intrinsics
-  // __old = __window[0:16] where [__cmp] resides in either of the two 8b offsets
+  // Algorithm for 8b CAS with 32b intrinsics
+  // __old = __window[0:32] where [__cmp] resides within any of the potential offsets
   // First CAS attempt 'guesses' that the masked portion of the window is 0x00.
-  uint16_t __old       = (uint16_t(__cmp) << __offset);
-  uint16_t __old_value = 0;
+  uint32_t __old       = (uint32_t(__cmp) << __offset);
+  uint32_t __old_value = 0;

   // Re-emit CAS instructions until either of two conditions is met
   while (1)
   {
     // Combine the desired value and most recently fetched expected masked portion of the window
-    uint16_t __attempt = (__old & ~__mask) | (uint16_t(__op) << __offset);
+    uint32_t __attempt = (__old & ~__mask) | (uint32_t(__op) << __offset);

     if (__cuda_atomic_compare_exchange(
-          __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}))
+          __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{}))
     {
       // CAS was successful
@@ -94,7 +97,7 @@ template _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or
 {
   _Type __expected = 0;
-  __cuda_atomic_load(__ptr, __expected, __atomic_cuda_relaxed{}, _Operand{}, _Sco{}, __atomic_cuda_mmio_disable{});
+  __atomic_load_cuda(__ptr, __expected, __ATOMIC_RELAXED, _Sco{});
   _Type __desired = __op(__expected);
   while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{}))
   {
@@ -103,13 +106,7 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or
   return __expected;
 }

-template <class _Operand>
-using __atomic_cuda_enable_non_native_add = typename enable_if<_Operand::__size <= 16, bool>::type;
-
-template <class _Operand>
-using __atomic_cuda_enable_non_native_exchange = typename enable_if<_Operand::__size <= 8, bool>::type;
-
-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -122,7 +119,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __d
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -135,7 +132,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __d
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -148,7 +145,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __d
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_add<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
 {
   __dst = __cuda_atomic_fetch_update(
@@ -161,7 +158,32 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_exchange<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
+{
+  __dst = __cuda_atomic_fetch_update(
+    __ptr,
+    [__op](_Type __old) {
+      return __old < __op ? __old : __op;
+    },
+    _Order{},
+    __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
+    _Sco{});
+}
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
+static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
+{
+  __dst = __cuda_atomic_fetch_update(
+    __ptr,
+    [__op](_Type __old) {
+      return __old > __op ? __old : __op;
+    },
+    _Order{},
+    __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
+    _Sco{});
+}
+
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
@@ -174,7 +196,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds
     _Sco{});
 }

-template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_exchange<_Operand> = 0>
+template <class _Type, class _Order, class _Operand, class _Sco, __atomic_cuda_enable_non_native_size<_Operand> = 0>
 static inline _CCCL_DEVICE void
 __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable)
 {
@@ -254,7 +276,7 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __
 }

 template = 0>
-_CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -265,7 +287,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder
     _Sco{});
 }
 template = 0>
-_CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -277,7 +299,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int _
     _Sco{});
 }

 template = 0>
-_CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -288,7 +310,7 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorde
     _Sco{});
 }
 template = 0>
-_CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
+_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
 {
   return __atomic_fetch_update_cuda(
     __ptr,
@@ -299,233 +321,6 @@ _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int
     _Sco{});
 }

-// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
-// _CCCL_DEVICE bool __atomic_compare_exchange_cuda(
-//   void volatile* __ptr,
-//   _Tp* __expected,
-//   const _Tp __desired,
-//   bool __weak,
-//   int __success_memorder,
-//   int __failure_memorder,
-//   _Sco)
-// {
-//   using __proxy_t = _If;
-//   __proxy_t __old = 0;
-//   __proxy_t __new = 0;
-//   memcpy(&__old, __expected, sizeof(__proxy_t));
-//   memcpy(&__new, &__desired, sizeof(__proxy_t));
-//   bool __result =
-//     __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{});
-//   memcpy(__expected, &__old, sizeof(__proxy_t));
-//   return __result;
-// }
-// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
-// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__old, __expected, sizeof(__proxy_t)); -// memcpy(&__new, &__desired, sizeof(__proxy_t)); -// bool __result = -// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); -// memcpy(__expected, &__old, sizeof(__proxy_t)); -// return __result; -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } - -// template = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) -// ; -// *__ret = __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected + __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected + __val; -// } -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? __expected : __val; -// } - -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? 
__expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected - __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected - __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected & __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected & __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected ^ __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected ^ __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected | __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected | __val; -// } -// return __expected; -// } - -// template -// _CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( -// _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// return __atomic_compare_exchange_cuda( -// __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); -// } - -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); -// return __ret; -// } -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); -// return __ret; -// } - _CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int) { asm volatile("" ::: "memory"); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 
e72144b68c..3b164cfc52 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -116,162 +116,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__ ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("ld.mmio.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void 
__cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("ld.mmio.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, 
_Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.acquire.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("ld.mmio.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, 
__atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_load( - const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_load( const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) @@ -872,58 +716,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &_ ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.release.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.relaxed.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.relaxed.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.relaxed.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ 
asm volatile("st.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) -{ asm volatile("st.mmio.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_store( - _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) -{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_store( _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) @@ -1220,86 +1012,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch return __res; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acquire.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acquire.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acquire.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acquire.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.relaxed.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) 
: "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.relaxed.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.relaxed.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.relaxed.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.release.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.release.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.release.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.release.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acq_rel.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acq_rel.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acq_rel.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : 
"memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acq_rel.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } template static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) @@ -1763,86 +1475,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acquire.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acquire.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acquire.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acquire.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.relaxed.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), 
"h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.relaxed.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.relaxed.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.relaxed.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.release.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.release.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.release.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.release.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acq_rel.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acq_rel.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acq_rel.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acq_rel.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.cta.b16 %0,[%1],%2;" : "=h"(__old) 
: "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_exchange( _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) From ae24c8ecba44dee1a3794763af06d48e8e6fb867 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 8 Aug 2024 14:48:36 -0700 Subject: [PATCH 05/29] Get several tests passing for 8/16b atomics --- .../std/__atomic/functions/cuda_ptx_derived.h | 40 +++++++++---------- .../functions/cuda_ptx_generated_helper.h | 12 +----- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 710362864c..accbe65243 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -13,11 +13,6 @@ #include -#include - -#include "cuda/std/__atomic/functions/cuda_ptx_generated_helper.h" -#include "cuda_ptx_generated.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -37,17 +32,18 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) template -using __atomic_cuda_enable_non_native_size = typename enable_if<_Operand::__size <= 16, bool>::type; +using __cuda_atomic_enable_non_native_arithmetic = + typename enable_if<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>::type; template -using __atomic_cuda_enable_native_size = typename enable_if<_Operand::__size >= 32, bool>::type; +using __cuda_atomic_enable_non_native_bitwise = typename enable_if<_Operand::__size <= 16, bool>::type; -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; + const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * (sizeof(_Type) * 8); uint32_t __value = 0; @@ -55,7 +51,7 @@ __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __a __dst = static_cast<_Type>(__value >> __offset); } -template = 0> +template = 0> static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco) { @@ -66,8 +62,8 @@ __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __ // Algorithm for 8b CAS with 32b intrinsics 
// __old = __window[0:32] where [__cmp] resides within any of the potential offsets // First CAS attempt 'guesses' that the masked portion of the window is 0x00. - uint32_t __old = (uint32_t(__op) << __offset); - uint32_t __old_value = 0; + uint32_t __old = (uint32_t(__op) << __offset); + _Type __old_value = 0; // Reemit CAS instructions until either of two conditions are met while (1) @@ -81,7 +77,7 @@ __cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __ // CAS was successful return true; } - __old_value = (__old & __mask) >> __offset; + __old_value = static_cast<_Type>((__old & __mask) >> __offset); // The expected value no longer matches inside the CAS. if (__old_value != __cmp) { @@ -106,7 +102,7 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or return __expected; } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -119,7 +115,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __d _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -132,7 +128,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __d _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -145,7 +141,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __d _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -158,7 +154,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -170,20 +166,20 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __d __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( __ptr, [__op](_Type __old) { - return __old < __op ? __old : __op; + return __old > __op ? 
__old : __op; }, _Order{}, __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { __dst = __cuda_atomic_fetch_update( @@ -196,7 +192,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds _Sco{}); } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index cc3f3f4659..f806d3d0d8 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -144,14 +144,7 @@ using __atomic_cuda_deduce_arithmetic = __atomic_cuda_operand_deduction>>>>>; template -using __atomic_cuda_deduce_minmax = - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - __atomic_cuda_operand_deduction>, - _If, - __atomic_cuda_operand_deduction>>; +using __atomic_cuda_deduce_minmax = __atomic_cuda_deduce_arithmetic<_Type>; template using __atomic_enable_if_native_bitwise = bool; @@ -159,9 +152,6 @@ using __atomic_enable_if_native_bitwise = bool; template using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; -template -using __atomic_enable_if_not_native_arithmetic = typename enable_if::type; - template using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; From 1af8667d5e018d5f562edca26954f2c46cfc6155 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:32:05 -0700 Subject: [PATCH 06/29] Remove todo and ifdefs from tests covering 8b/16b atomics --- .../libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp | 1 - .../libcudacxx/heterogeneous/atomic/reference_std.pass.cpp | 5 ----- 2 files changed, 6 deletions(-) diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp index fbe613fe44..4cae377b27 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp @@ -175,7 +175,6 @@ using bitwise_atomic_testers = void kernel_invoker() { - // todo validate_pinned(); validate_pinned(); validate_pinned(); diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp index 9da187cee7..366b93b82f 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp @@ -167,19 +167,14 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); From ff06fa141909a65304818cb94eef0101dc57b128 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:33:22 -0700 Subject: [PATCH 07/29] Fix bug in 16b atomic load --- 
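Note: the 8/16b load widens to an aligned 32b window and shifts the loaded
value down by the byte offset within that window, so the shift must be
(addr & 3) * 8 regardless of sizeof(_Type). As an illustrative sketch, a
uint16_t at (addr & 3) == 2 needs __offset == 16, but the old formula
computed 2 * (sizeof(uint16_t) * 8) == 32 and shifted the halfword out of
the window entirely.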
.../include/cuda/std/__atomic/functions/cuda_ptx_derived.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index accbe65243..3c7b297fea 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -43,7 +43,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * (sizeof(_Type) * 8); + const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; uint32_t __value = 0; From 8daaad13d3d3f3d5abf0b8ba3e85334b7f14d2ae Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:34:31 -0700 Subject: [PATCH 08/29] Move store close to fetch_update since it is a derived primitive --- .../std/__atomic/functions/cuda_ptx_derived.h | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 3c7b297fea..f3009fefb6 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -102,6 +102,22 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or return __expected; } +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + printf("%s\r\n", __PRETTY_FUNCTION__); + // Store requires cas on 8/16b types + __cuda_atomic_fetch_update( + __ptr, + [__val](_Type __old) { + return __val; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { @@ -157,6 +173,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { + printf("%s\r\n", __PRETTY_FUNCTION__); __dst = __cuda_atomic_fetch_update( __ptr, [__op](_Type __old) { @@ -192,21 +209,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds _Sco{}); } -template = 0> -static inline _CCCL_DEVICE void -__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) -{ - // Store requires cas on 8b types - __cuda_atomic_fetch_update( - __ptr, - [__val](_Type __old) { - return __val; - }, - _Order{}, - __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, - _Sco{}); -} - template _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { @@ -274,6 +276,7 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __ template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { + printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { @@ -285,6 +288,7 @@ _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up 
__val, int __memorder, template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { + printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { From f588332a782c1f3c111d6722eb2d6afdc4440d52 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:35:35 -0700 Subject: [PATCH 09/29] Fix bug in minmax due to s64 overload missing for arithmetic types --- .../functions/cuda_ptx_generated_helper.h | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index f806d3d0d8..9b81adff18 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -144,7 +144,27 @@ using __atomic_cuda_deduce_arithmetic = __atomic_cuda_operand_deduction>>>>>; template -using __atomic_cuda_deduce_minmax = __atomic_cuda_deduce_arithmetic<_Type>; +using __atomic_cuda_deduce_minmax = + _If<_CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>, // atom.min|max.s64 + // supported + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>>>; template using __atomic_enable_if_native_bitwise = bool; From 60e8c256a1b2b63b6b6511866899fdc8d9bd9847 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 15:36:53 -0700 Subject: [PATCH 10/29] Add more 8/16b tests for atomic_ref --- .../std/atomics/atomics.types.generic/integral_ref.pass.cpp | 4 ++++ .../atomics.types.generic/integral_ref_constness.pass.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp index 56153f3664..5ef493b6dd 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>(); diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp index b237c862a5..57c4a10f53 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>(); From e9a79f4d84a6350179fbfe87188ce2cb3db131ed Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: 
Fri, 16 Aug 2024 16:34:52 -0700 Subject: [PATCH 11/29] Fixup remove debug prints --- .../include/cuda/std/__atomic/functions/cuda_ptx_derived.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index f3009fefb6..cc795e9495 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -106,7 +106,6 @@ template = 0> static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) { - printf("%s\r\n", __PRETTY_FUNCTION__); __dst = __cuda_atomic_fetch_update( __ptr, [__op](_Type __old) { @@ -276,7 +274,6 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __ template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { - printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { @@ -288,7 +285,6 @@ _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, template = 0> _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { - printf("%s\r\n", __PRETTY_FUNCTION__); return __atomic_fetch_update_cuda( __ptr, [__val](_Tp __old) { From b713109116180fda73b339c1d0fae398f3bf7aca Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 17:08:40 -0700 Subject: [PATCH 12/29] Cleanup bitmask hell, fix bug where lower mask was ignored --- .../std/__atomic/functions/cuda_ptx_derived.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index cc795e9495..c62fede58e 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -55,21 +55,22 @@ template ((__old & __mask) >> __offset); + auto __old_value = static_cast<_Type>((__old & __valueMask) >> __offset); // The expected value no longer matches inside the CAS. if (__old_value != __cmp) { From 1c2627d57643f15a6aca66a99ec7521957d02579 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 16 Aug 2024 17:09:30 -0700 Subject: [PATCH 13/29] Add test covering interleaved CAS onto same atomic window --- .../cuda/atomics/atomic_ref_small.pass.cpp | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp new file mode 100644 index 0000000000..d012c2f02b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 +// UNSUPPORTED: windows && pre-sm-70 + +// + +#include +#include +#include + +#include "test_macros.h" + +/* +Test goals: +Interleaved 8b/16b access to a 32b window while there is thread contention. + +for 8b: +Launch 1028 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +caused by interleaved access to different parts of the window. + +for 16b: +Launch 1028 threads, fetch_add(128) into both windows. +*/ + +template +__host__ __device__ void fetch_add_into_window(T* window, uint16_t* atomHistory) +{ + typedef cuda::atomic_ref Atom; + + Atom a(*window); + *atomHistory = a.fetch_add(Inc); +} + +template +__device__ void device_do_test(uint32_t expected) +{ + __shared__ uint16_t atomHistory[1024]; + __shared__ uint32_t atomicStorage; + cuda::atomic_ref bucket(atomicStorage); + + constexpr uint32_t offsetMask = ((4 / sizeof(T)) - 1); + // Access offset is interleaved meaning threads 4, 5, 6, 7 access window 0, 1, 2, 3 and so on. + const uint32_t threadOffset = threadIdx.x & offsetMask; + + if (threadIdx.x == 0) + { + bucket.store(0); + } + __syncthreads(); + + T* window = reinterpret_cast(&atomicStorage) + threadOffset; + fetch_add_into_window(window, atomHistory + threadIdx.x); + + __syncthreads(); + if (threadIdx.x == 0) + { + printf("expected: 0x%X\r\n", expected); + printf("result: 0x%X\r\n", bucket.load()); + assert(bucket.load() == expected); + } +} + +int main(int, char**) +{ + NV_DISPATCH_TARGET(NV_IS_HOST, + (cuda_thread_count = 1020;), + NV_IS_DEVICE, + (device_do_test(~uint32_t(0)); device_do_test(0xFF00FF00);)); + + return 0; +} From c0f52c8afe53c59ddb6e05d57ed8f767cbd9fea3 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Fri, 16 Aug 2024 18:06:32 -0700 Subject: [PATCH 14/29] Fixup documentation mistake. --- .../test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp index d012c2f02b..448215b3a6 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -22,11 +22,11 @@ Test goals: Interleaved 8b/16b access to a 32b window while there is thread contention. for 8b: -Launch 1028 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +Launch 1020 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption caused by interleaved access to different parts of the window. for 16b: -Launch 1028 threads, fetch_add(128) into both windows. +Launch 1020 threads, fetch_add(128) into both windows. */ template From 6d1beecaece3eee3968fb84f3f1f2a9d98c05c22 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 19 Aug 2024 11:15:05 -0700 Subject: [PATCH 15/29] Make atomics enable_if uses match rest of libcudacxx. 
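
Replace open-coded "typename enable_if<...>::type" spellings with the
__enable_if_t alias used in the rest of libcudacxx. Assuming the usual
alias definition (a sketch, not quoted from the headers):

  template <bool _Cond, class _Tp = void>
  using __enable_if_t = typename enable_if<_Cond, _Tp>::type;

the converted helpers behave identically; only the spelling changes.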
--- .../std/__atomic/functions/cuda_ptx_derived.h | 4 +- .../functions/cuda_ptx_generated_helper.h | 6 +- .../std/__atomic/platform/msvc_to_builtins.h | 66 +++++++++---------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index c62fede58e..0b9b3860a6 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -33,10 +33,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template using __cuda_atomic_enable_non_native_arithmetic = - typename enable_if<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>::type; + __enable_if_t<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>; template -using __cuda_atomic_enable_non_native_bitwise = typename enable_if<_Operand::__size <= 16, bool>::type; +using __cuda_atomic_enable_non_native_bitwise = __enable_if_t<_Operand::__size <= 16, bool>; template = 0> static inline _CCCL_DEVICE void diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index 9b81adff18..9a4c2d5bfc 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -170,13 +170,13 @@ template using __atomic_enable_if_native_bitwise = bool; template -using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; +using __atomic_enable_if_native_arithmetic = __enable_if_t<_CCCL_TRAIT(is_scalar, _Type), bool>; template -using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; +using __atomic_enable_if_native_minmax = __enable_if_t<_CCCL_TRAIT(is_integral, _Type), bool>; template -using __atomic_enable_if_not_native_minmax = typename enable_if::type; +using __atomic_enable_if_not_native_minmax = __enable_if_t; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index 8afa9756ef..33d11fd9ad 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -72,9 +72,9 @@ static inline void __atomic_thread_fence(int __memorder) } template -using _enable_if_sized_as = typename enable_if::type; +using __enable_if_sized_as = __enable_if_t; -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -84,7 +84,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -94,7 +94,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -104,7 +104,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 
0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -136,7 +136,7 @@ void __atomic_load(const volatile _Type* __ptr, _Type* __ret, int __memorder) } } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int8*>(__val); @@ -147,7 +147,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store8(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int16*>(__val); @@ -158,7 +158,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store16(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int32*>(__val); @@ -170,7 +170,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store32(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int64*>(__val); @@ -202,7 +202,7 @@ void __atomic_store(volatile _Type* __ptr, _Type* __val, int __memorder) } } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -215,7 +215,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -228,7 +228,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -241,7 +241,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -287,25 +287,25 @@ bool __atomic_compare_exchange( return success; } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = 
_InterlockedExchange64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -341,25 +341,25 @@ void __atomic_exchange(_Type volatile* __ptr, const _Type* __val, _Type* __ret, } } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -404,25 +404,25 @@ _Type __atomic_fetch_sub(_Type volatile* __ptr, _Delta __val, int __memorder) return __atomic_fetch_add(__ptr, 0 - __val, __memorder); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -462,25 +462,25 @@ _Type __atomic_fetch_and(_Type volatile* __ptr, _Delta __val, int __memorder) return *__dest; } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor64((volatile __int64*) 
__ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -520,25 +520,25 @@ _Type __atomic_fetch_xor(_Type volatile* __ptr, _Delta __val, int __memorder) return *__dest; } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); From d5f89285e131093c2089d4b59a15219cf167d715 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Mon, 19 Aug 2024 15:32:42 -0700 Subject: [PATCH 16/29] Verify fetch_add sequential load behavior in 8b/16b atomics --- .../cuda/atomics/atomic_ref_small.pass.cpp | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp index 448215b3a6..e1f62d856f 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -22,11 +22,11 @@ Test goals: Interleaved 8b/16b access to a 32b window while there is thread contention. for 8b: -Launch 1020 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +Launch 1024 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption caused by interleaved access to different parts of the window. for 16b: -Launch 1020 threads, fetch_add(128) into both windows. +Launch 1024 threads, fetch_add(1), checking for 0x01FF01FF. 
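+
+Arithmetic sketch for the expected values:
+for 8b: 1024 threads over 4 byte windows is 256 increments per byte; 256 wraps
+to 0x00 modulo 2^8, so the final 32b value is 0.
+for 16b: 1024 threads over 2 halfword windows is 512 increments per halfword,
+so each window ends at 0x0200 and the final 32b value is 0x02000200; the last
+fetch_add in each window returns the pre-increment value 0x01FF.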
*/ template @@ -38,11 +38,17 @@ __host__ __device__ void fetch_add_into_window(T* window, uint16_t* atomHistory) *atomHistory = a.fetch_add(Inc); } -template +template __device__ void device_do_test(uint32_t expected) { - __shared__ uint16_t atomHistory[1024]; + constexpr uint32_t threadCount = 1024; + constexpr uint32_t histogramResultCount = 256 * sizeof(T); + constexpr uint32_t histogramEntriesPerThread = 4 / sizeof(T); + + __shared__ uint16_t atomHistory[threadCount]; + __shared__ uint8_t atomHistogram[histogramResultCount]; __shared__ uint32_t atomicStorage; + cuda::atomic_ref bucket(atomicStorage); constexpr uint32_t offsetMask = ((4 / sizeof(T)) - 1); @@ -51,16 +57,28 @@ __device__ void device_do_test(uint32_t expected) if (threadIdx.x == 0) { + memset(atomHistogram, 0, histogramResultCount); bucket.store(0); } __syncthreads(); T* window = reinterpret_cast(&atomicStorage) + threadOffset; - fetch_add_into_window(window, atomHistory + threadIdx.x); + fetch_add_into_window(window, atomHistory + threadIdx.x); __syncthreads(); if (threadIdx.x == 0) { + // For each thread, add its atomic result into the corresponding bucket + for (int i = 0; i < threadCount; i++) + { + atomHistogram[atomHistory[i]]++; + } + // Check that each bucket has exactly (4 / sizeof(T)) entries + // This checks that atomic fetch operations were sequential. i.e. 4xfetch_add(1) returns [0, 1, 2, 3] + for (int i = 0; i < histogramResultCount; i++) + { + assert(atomHistogram[i] == histogramEntriesPerThread); + } printf("expected: 0x%X\r\n", expected); printf("result: 0x%X\r\n", bucket.load()); assert(bucket.load() == expected); @@ -70,9 +88,9 @@ __device__ void device_do_test(uint32_t expected) int main(int, char**) { NV_DISPATCH_TARGET(NV_IS_HOST, - (cuda_thread_count = 1020;), + (cuda_thread_count = 1024;), NV_IS_DEVICE, - (device_do_test(~uint32_t(0)); device_do_test(0xFF00FF00);)); + (device_do_test(0); device_do_test(0x02000200);)); return 0; } From c9ca506b2747f1d94129fdd94ab876e4794947ce Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 20 Aug 2024 12:59:09 -0700 Subject: [PATCH 17/29] Remove 8b/16b add PTX tests --- .../test/atomic_codegen/atomic_add_16b.cu | 23 ------------------- .../test/atomic_codegen/atomic_add_8b.cu | 23 ------------------- 2 files changed, 46 deletions(-) delete mode 100644 libcudacxx/test/atomic_codegen/atomic_add_16b.cu delete mode 100644 libcudacxx/test/atomic_codegen/atomic_add_8b.cu diff --git a/libcudacxx/test/atomic_codegen/atomic_add_16b.cu b/libcudacxx/test/atomic_codegen/atomic_add_16b.cu deleted file mode 100644 index 95bc80e9a4..0000000000 --- a/libcudacxx/test/atomic_codegen/atomic_add_16b.cu +++ /dev/null @@ -1,23 +0,0 @@ -#include - -__global__ void add_relaxed_device_non_volatile(uint16_t* data, uint16_t* out, uint16_t n) -{ - auto ref = cuda::atomic_ref{*(data)}; - *out = ref.fetch_add(n, cuda::std::memory_order_relaxed); -} - -/* - -; SM8X-LABEL: .target sm_80 -; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; -; SM8X-NEXT: ret; - -*/ diff --git a/libcudacxx/test/atomic_codegen/atomic_add_8b.cu 
b/libcudacxx/test/atomic_codegen/atomic_add_8b.cu deleted file mode 100644 index 8ff6f624e3..0000000000 --- a/libcudacxx/test/atomic_codegen/atomic_add_8b.cu +++ /dev/null @@ -1,23 +0,0 @@ -#include - -__global__ void add_relaxed_device_non_volatile(uint8_t* data, uint8_t* out, uint8_t n) -{ - auto ref = cuda::atomic_ref{*(data)}; - *out = ref.fetch_add(n, cuda::std::memory_order_relaxed); -} - -/* - -; SM8X-LABEL: .target sm_80 -; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; -; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; -; SM8X-NEXT: ret; - -*/ From 6f5d0b8b49f95d194b3f3f08ac3ce09489c11d2d Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 20 Aug 2024 18:10:36 -0700 Subject: [PATCH 18/29] Optimize fetch_update CAS loops --- .../std/__atomic/functions/cuda_ptx_derived.h | 107 ++++++++++++++---- 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 0b9b3860a6..06e965d4a2 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -38,12 +38,16 @@ using __cuda_atomic_enable_non_native_arithmetic = template using __cuda_atomic_enable_non_native_bitwise = __enable_if_t<_Operand::__size <= 16, bool>; +template +using __cuda_atomic_enable_native_bitwise = __enable_if_t<_Operand::__size >= 32, bool>; + template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { - uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - const uint32_t __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; uint32_t __value = 0; @@ -55,22 +59,30 @@ template ((__old & __valueMask) >> __offset); - // The expected value no longer matches inside the CAS. - if (__old_value != __cmp) + } + __dst = static_cast<_Type>(__old >> __offset); + return false; +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> +_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + + // 8/16b fetch update is similar to CAS implementation, but compresses the logic for recalculating the operand + // __old = __window[0:32] where [__cmp] resides within some offset. 
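+  // Illustrative example: for a uint8_t at an address with (addr & 3) == 2,
+  // __offset == 16, __valueMask == 0x00FF0000 and __windowMask == 0xFF00FFFF,
+  // so each CAS attempt below replaces only byte 2 of the aligned 32b window.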
+ uint32_t __old; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + // Reemit CAS instructions until we succeed + while (1) + { + // Calculate new desired value from last fetched __old + // Use of the value mask is required due to the possibility of overflow when ops are widened. Possible compiler bug? + const uint32_t __attempt = + ((static_cast(__op(static_cast<_Type>(__old >> __offset))) << __offset) & __valueMask) + | (__old & __windowMask); + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) { - __dst = __old_value; - break; + // CAS was successful + return static_cast<_Type>(__old >> __offset); } } - return false; } -// Lower level fetch_update that bypasses memorder dispatch -template +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) { _Type __expected = 0; - __atomic_load_cuda(__ptr, __expected, __ATOMIC_RELAXED, _Sco{}); + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + _Type __desired = __op(__expected); while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{})) { From b7b944e3db7724167f6ed8ebab70b910b0e63e88 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 20 Aug 2024 18:10:58 -0700 Subject: [PATCH 19/29] Fix name of preset for PTX codegen test --- CMakePresets.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakePresets.json b/CMakePresets.json index 61cb88eca8..357c4da496 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -478,7 +478,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.diff)$" + "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.ptx)$" } } }, From 2381c4297290e936642ed36698dac97e02ecba97 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 21 Aug 2024 09:29:44 -0700 Subject: [PATCH 20/29] Fix signed/unsigned comparison --- .../test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp index e1f62d856f..ab86ca5222 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -69,13 +69,13 @@ __device__ void device_do_test(uint32_t expected) if (threadIdx.x == 0) { // For each thread, add its atomic result into the corresponding bucket - for (int i = 0; i < threadCount; i++) + for (uint32_t i = 0; i < threadCount; i++) { atomHistogram[atomHistory[i]]++; } // Check that each bucket has exactly (4 / sizeof(T)) entries // This checks that atomic fetch operations were sequential. i.e. 
4xfetch_add(1) returns [0, 1, 2, 3] - for (int i = 0; i < histogramResultCount; i++) + for (uint32_t i = 0; i < histogramResultCount; i++) { assert(atomHistogram[i] == histogramEntriesPerThread); } From 07d1077c41f1a2940bd8668687c0bb2ac9c325c0 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 21 Aug 2024 10:30:30 -0700 Subject: [PATCH 21/29] Fix atomics codegen tests not being built --- CMakePresets.json | 5 +++-- libcudacxx/test/CMakeLists.txt | 1 + libcudacxx/test/atomic_codegen/CMakeLists.txt | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 357c4da496..10bdd83539 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -356,7 +356,8 @@ "libcudacxx.test.internal_headers", "libcudacxx.test.public_headers", "libcudacxx.test.public_headers_host_only", - "libcudacxx.test.lit.precompile" + "libcudacxx.test.lit.precompile", + "libcudacxx.test.atomics.ptx" ] }, { @@ -478,7 +479,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.ptx)$" + "name": "^libcudacxx\\.test\\.lit$" } } }, diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index b20a0f3fed..2a7b4d7130 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -43,6 +43,7 @@ endif() find_program(filecheck "FileCheck") +add_custom_target(libcudacxx.test.atomics.ptx) if (filecheck) message("-- ${filecheck} found... building atomic codegen tests") add_subdirectory(atomic_codegen) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 7aee09700a..7cc291a054 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,5 +1,3 @@ -add_custom_target(libcudacxx.test.atomics.ptx) - find_program(filecheck "FileCheck" REQUIRED) find_program(cuobjdump "cuobjdump" REQUIRED) find_program(bash "bash" REQUIRED) From a2d19d13a1d052f8190b29aa318a10d1812679a5 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 21 Aug 2024 14:42:22 -0700 Subject: [PATCH 22/29] Fix CMake target for libcudacxx ptx tests. --- libcudacxx/test/CMakeLists.txt | 8 +------- libcudacxx/test/atomic_codegen/CMakeLists.txt | 11 ++++++++++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index 2a7b4d7130..0b0f18a7c5 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -41,10 +41,4 @@ if (LIBCUDACXX_TEST_WITH_NVRTC) add_subdirectory(utils/nvidia/nvrtc) endif() -find_program(filecheck "FileCheck") - -add_custom_target(libcudacxx.test.atomics.ptx) -if (filecheck) - message("-- ${filecheck} found... building atomic codegen tests") - add_subdirectory(atomic_codegen) -endif() +add_subdirectory(atomic_codegen) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 7cc291a054..ac674b09b5 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,4 +1,13 @@ -find_program(filecheck "FileCheck" REQUIRED) +add_custom_target(libcudacxx.test.atomics.ptx) + +find_program(filecheck "FileCheck") + +if (filecheck) + message("-- ${filecheck} found... 
building atomic codegen tests") +else() + return() +endif() + find_program(cuobjdump "cuobjdump" REQUIRED) find_program(bash "bash" REQUIRED) From 70aa4a3ba07aa7125c3611d645c5bbcc871707b9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Tue, 27 Aug 2024 10:15:28 -0700 Subject: [PATCH 23/29] Make dump_and_check executable again --- libcudacxx/test/atomic_codegen/dump_and_check.bash | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 libcudacxx/test/atomic_codegen/dump_and_check.bash diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash old mode 100644 new mode 100755 From e08ed80759fbb70f85dbe389ba055ed209af47c2 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 28 Aug 2024 13:41:01 -0700 Subject: [PATCH 24/29] Work around inconsistent parsing of [[[ in FileCheck versions --- libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu | 6 +++--- libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu | 6 +++--- libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu | 6 +++--- libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu | 4 ++-- libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu | 4 ++-- libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu index d97636d647..d4b3efe3b5 100644 --- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: // ; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; diff --git a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu index e0b6c3d151..b6c5ff4aeb 100644 --- a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void cas_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*cas_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: ld.global.u32 %r[[#LOCALEXP:]], [%rd[[#INPUT]]]; ; SM8X-NEXT: // diff --git a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu 
b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu index 787fcd30bb..58a87c3f14 100644 --- a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void exch_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*exch_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: // ; SM8X-NEXT: atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; diff --git a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu index 6bef7f1c2a..42fea95303 100644 --- a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu @@ -10,8 +10,8 @@ __global__ void load_relaxed_device_non_volatile(int* data, int* out) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*load_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: // ; SM8X-NEXT: ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]]; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index 983c8e9fac..fca00e332a 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -10,8 +10,8 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*store_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: // ; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; ; SM8X-NEXT: // diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index 9d1ffaefa1..70b306b9de 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -10,9 +10,9 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*sub_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], 
{{.*}}[[FUNCTION]]_param_0{{.*}}
+; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}}
+; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}}
 ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]];
 ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]];
 ; SM8X-NEXT: //

From 1000597d738cf165dc8ff5c8bcf285ed665f8850 Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Wed, 4 Sep 2024 10:12:21 -0700
Subject: [PATCH 25/29] Make min/max match algorithm.min/max.

---
 .../include/cuda/std/__atomic/functions/cuda_ptx_derived.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
index 06e965d4a2..93d4143129 100644
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
@@ -249,7 +249,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __d
   __dst = __cuda_atomic_fetch_update(
     __ptr,
     [__op](_Type __old) {
-      return __old > __op ? __old : __op;
+      return __op < __old ? __old : __op;
     },
     _Order{},
     __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
     _Sco{});

From b7baeef9aab93fb1c1d51eb1aa9de4cc4dc37d4d Mon Sep 17 00:00:00 2001
From: Wesley Maxey
Date: Wed, 11 Sep 2024 13:16:29 -0700
Subject: [PATCH 26/29] Work around NVCC 11.X using different syntax for
 inline PTX

---
 libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu   | 4 +---
 libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu   | 5 ++---
 libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu  | 4 +---
 libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu  | 4 +---
 libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu | 4 +---
 libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu   | 4 +---
 libcudacxx/test/atomic_codegen/dump_and_check.bash          | 2 +-
 7 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu
index d4b3efe3b5..9d1317e021 100644
--- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu
+++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu
@@ -14,9 +14,7 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n)
 ; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}}
 ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}}
 ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]];
-; SM8X-NEXT: //
-; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];
-; SM8X-NEXT: //
+; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}}
 ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]];
 ; SM8X-NEXT: ret;

diff --git a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu
index b6c5ff4aeb..99c73f4c32 100644
--- a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu
+++ b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu
@@ -6,6 +6,7 @@ __global__ void cas_device_relaxed_non_volatile(int* data, int* out, int n)
   ref.compare_exchange_strong(*out, n, cuda::std::memory_order_relaxed);
 }

+// clang-format off
 /*

 ; SM8X-LABEL: .target sm_80
 ; SM8X: .visible .entry [[FUNCTION:_.*cas_device_relaxed_non_volatile.*]](
; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; ; SM8X-NEXT: ld.global.u32 %r[[#LOCALEXP:]], [%rd[[#INPUT]]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu index 58a87c3f14..59c4d52d97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu @@ -14,9 +14,7 @@ __global__ void exch_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu index 42fea95303..804f953294 100644 --- a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu @@ -13,9 +13,7 @@ __global__ void load_relaxed_device_non_volatile(int* data, int* out) ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} ; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index fca00e332a..4dcae5d176 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -12,9 +12,7 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X: .visible .entry [[FUNCTION:_.*store_relaxed_device_non_volatile.*]]( ; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} ; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} -; SM8X-NEXT: // -; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: ret; */ diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index 70b306b9de..b8a40fee97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -15,9 +15,7 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-DAG: 
ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index a5c62e91bd..c66bb43363 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -6,4 +6,4 @@ input_archive=$1 input_testfile=$2 input_prefix=$3 -cuobjdump --dump-ptx $input_archive | FileCheck --check-prefix $input_prefix $input_testfile +cuobjdump --dump-ptx $input_archive | FileCheck --match-full-lines --check-prefix $input_prefix $input_testfile From c59b3ab1c3eafc933aea0b7fe44af47adc859bae Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 11 Sep 2024 14:27:18 -0700 Subject: [PATCH 27/29] Fix warnings in the codegen tests. --- libcudacxx/test/atomic_codegen/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index ac674b09b5..d881e823f8 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -25,9 +25,12 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) set_target_properties( atomic_codegen_${test_name} - PROPERTIES CUDA_ARCHITECTURES "80" + PROPERTIES + CUDA_ARCHITECTURES "80" ) + target_compile_options(atomic_codegen_${test_name} PRIVATE "-Wno-comment") + ## Important for testing the local headers target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include") add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name}) From 642b4870bfea9f245b287dca2282cc3fd13a9632 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 11 Sep 2024 14:28:10 -0700 Subject: [PATCH 28/29] Use PTX 16b ld/st instead of 32b CAS --- libcudacxx/codegen/generators/ld_st.h | 2 + .../std/__atomic/functions/cuda_ptx_derived.h | 10 +- .../__atomic/functions/cuda_ptx_generated.h | 208 ++++++++++++++++++ 3 files changed, 218 insertions(+), 2 deletions(-) diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h index 43cf80ef5c..d4aec3da54 100644 --- a/libcudacxx/codegen/generators/ld_st.h +++ b/libcudacxx/codegen/generators/ld_st.h @@ -92,6 +92,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_load( {{ asm volatile("ld{8}{4}{6}.{0}{1} %0,[%1];" : "={2}"(__dst) : "l"(__ptr) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { + 16, 32, 64, 128, @@ -248,6 +249,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_store( {{ asm volatile("st{8}{4}{6}.{0}{1} [%0],%1;" :: "l"(__ptr), "{2}"(__val) : "memory"); }})XXX"; constexpr size_t supported_sizes[] = { + 16, 32, 64, 128, diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 93d4143129..6ecc4f16b2 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -41,7 +41,13 @@ using __cuda_atomic_enable_non_native_bitwise = 
__enable_if_t<_Operand::__size < template using __cuda_atomic_enable_native_bitwise = __enable_if_t<_Operand::__size >= 32, bool>; -template = 0> +template +using __cuda_atomic_enable_non_native_ld_st = __enable_if_t<_Operand::__size <= 8, bool>; + +template +using __cuda_atomic_enable_native_ld_st = __enable_if_t<_Operand::__size >= 16, bool>; + +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { @@ -164,7 +170,7 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or return __expected; } -template = 0> +template = 0> static inline _CCCL_DEVICE void __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) { diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 3b164cfc52..01a0f2e3a5 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -116,6 +116,162 @@ static inline _CCCL_DEVICE void __cuda_atomic_load_memory_order_dispatch(_Fn &__ ) } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : 
"l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.b16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static 
inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_u16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.u16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.acquire.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cta.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.cluster.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + 
const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.gpu.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("ld.mmio.relaxed.sys.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_load( + const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_s16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("ld.volatile.s16 %0,[%1];" : "=h"(__dst) : "l"(__ptr) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_load( const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) @@ -716,6 +872,58 @@ static inline _CCCL_DEVICE void __cuda_atomic_store_memory_order_dispatch(_Fn &_ ) } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.release.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, 
__atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cta.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.cluster.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.gpu.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_enable) +{ asm volatile("st.mmio.relaxed.sys.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } +template +static inline _CCCL_DEVICE void __cuda_atomic_store( + _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag, __atomic_cuda_mmio_disable) +{ asm volatile("st.volatile.b16 [%0],%1;" :: "l"(__ptr), "h"(__val) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_store( _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b32, __thread_scope_block_tag, __atomic_cuda_mmio_disable) From b1901a281dcfc97af3fbf1d1d416cc218b69cba5 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Wed, 11 Sep 2024 14:46:16 -0700 Subject: [PATCH 29/29] Switch 8b ld/st to 16b ld --- .../cuda/std/__atomic/functions/cuda_ptx_derived.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 6ecc4f16b2..5139b7d603 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -51,13 +51,13 @@ template (__value >> __offset); }
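A note for readers following patches 28 and 29 above: the series implements sub-halfword atomics through the aligned 16-bit window that contains the byte. The address is rounded down to 2-byte alignment, the byte's position within the halfword becomes a shift and a mask, and read-modify-write operations retry a 16-bit compare-and-swap until the spliced value lands. The standalone sketch below illustrates that windowing arithmetic using CUDA's built-in atomicCAS on unsigned short (available on compute capability 7.0 and newer). It is a simplified stand-in, not the library's implementation: libcu++ instead emits scoped, memory-order-qualified PTX through the generated __cuda_atomic_* intrinsics shown above, and the function name byte_fetch_add_relaxed here is hypothetical.

#include <cstdint>

// Sketch only: emulate an 8-bit relaxed fetch_add with a CAS loop on the
// aligned 16-bit halfword that contains the byte. Requires the unsigned
// short overload of atomicCAS (compute capability 7.0+).
__device__ unsigned char byte_fetch_add_relaxed(unsigned char* ptr, unsigned char op)
{
  // Round the address down to the containing 16-bit halfword.
  auto* aligned = reinterpret_cast<unsigned short*>(
    reinterpret_cast<std::uintptr_t>(ptr) & ~std::uintptr_t(1));
  // Bit offset of the byte within the halfword: 0 or 8 (NVIDIA GPUs are little-endian).
  const unsigned shift = static_cast<unsigned>(reinterpret_cast<std::uintptr_t>(ptr) & 1) * 8;
  const unsigned short mask = static_cast<unsigned short>(0xFFu << shift);

  unsigned short old = *aligned; // initial guess; the CAS loop corrects it
  while (true)
  {
    const unsigned char old_byte = static_cast<unsigned char>((old & mask) >> shift);
    // Splice the incremented byte back in without disturbing its neighbor.
    const unsigned short desired = static_cast<unsigned short>(
      (old & ~mask) | ((static_cast<unsigned short>(old_byte + op) << shift) & mask));
    const unsigned short prev = atomicCAS(aligned, old, desired);
    if (prev == old)
    {
      return old_byte; // fetch_add returns the value observed before the add
    }
    old = prev; // another thread intervened; retry with the freshly observed halfword
  }
}

For plain loads and stores, patches 28 and 29 avoid the loop entirely: since PTX exposes native 16-bit ld/st, an 8-bit atomic load can be served by a 16-bit load of the aligned halfword followed by the same shift (the __value >> __offset remnant in the truncated final hunk points at exactly this), rather than by a compare-and-swap issued just to observe one byte.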