diff --git a/CMakePresets.json b/CMakePresets.json index bd9374778b..ecc9b22761 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -357,7 +357,8 @@ "libcudacxx.test.internal_headers", "libcudacxx.test.public_headers", "libcudacxx.test.public_headers_host_only", - "libcudacxx.test.lit.precompile" + "libcudacxx.test.lit.precompile", + "libcudacxx.test.atomics.ptx" ] }, { @@ -479,7 +480,7 @@ ], "filter": { "exclude": { - "name": "^libcudacxx\\.test\\.(lit|atomics\\.codegen\\.diff)$" + "name": "^libcudacxx\\.test\\.lit$" } } }, diff --git a/libcudacxx/codegen/generators/compare_and_swap.h b/libcudacxx/codegen/generators/compare_and_swap.h index 5a970735c0..17b11ab69c 100644 --- a/libcudacxx/codegen/generators/compare_and_swap.h +++ b/libcudacxx/codegen/generators/compare_and_swap.h @@ -83,7 +83,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/codegen/generators/exchange.h b/libcudacxx/codegen/generators/exchange.h index dcfe66f147..f89a0075ae 100644 --- a/libcudacxx/codegen/generators/exchange.h +++ b/libcudacxx/codegen/generators/exchange.h @@ -81,7 +81,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange( }; constexpr size_t supported_sizes[] = { - 16, 32, 64, 128, diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h index 0e525bf296..5139b7d603 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -13,8 +13,6 @@ #include -#include "cuda_ptx_generated.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -33,6 +31,250 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_CUDA_COMPILER) +template +using __cuda_atomic_enable_non_native_arithmetic = + __enable_if_t<_Operand::__size <= 16 || _Operand::__op == __atomic_cuda_operand::_f, bool>; + +template +using __cuda_atomic_enable_non_native_bitwise = __enable_if_t<_Operand::__size <= 16, bool>; + +template +using __cuda_atomic_enable_native_bitwise = __enable_if_t<_Operand::__size >= 32, bool>; + +template +using __cuda_atomic_enable_non_native_ld_st = __enable_if_t<_Operand::__size <= 8, bool>; + +template +using __cuda_atomic_enable_native_ld_st = __enable_if_t<_Operand::__size >= 16, bool>; + +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_load(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + constexpr uint64_t __alignmask = (sizeof(uint16_t) - 1); + uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint16_t((intptr_t) __ptr & __alignmask) * 8; + + uint16_t __value = 0; + __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{}); + + __dst = static_cast<_Type>(__value >> __offset); +} + +template = 0> +static inline _CCCL_DEVICE bool +__cuda_atomic_compare_exchange(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = 
~__valueMask; + const uint32_t __cmpOffset = __cmp << __offset; + const uint32_t __opOffset = __op << __offset; + + // Algorithm for 8b CAS with 32b intrinsics + // __old = __window[0:32] where [__cmp] resides within some offset. + uint32_t __old; + // Start by loading __old with the current value, this optimizes for early return when __cmp is wrong + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + // Reemit CAS instructions until we succeed or the old value is a mismatch + while (__cmpOffset == (__old & __valueMask)) + { + // Combine the desired value and most recently fetched expected masked portion of the window + const uint32_t __attempt = (__old & __windowMask) | __opOffset; + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return true; + } + } + __dst = static_cast<_Type>(__old >> __offset); + return false; +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> +_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + + // 8/16b fetch update is similar to CAS implementation, but compresses the logic for recalculating the operand + // __old = __window[0:32] where [__cmp] resides within some offset. + uint32_t __old; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + // Reemit CAS instructions until we succeed + while (1) + { + // Calculate new desired value from last fetched __old + // Use of the value mask is required due to the possibility of overflow when ops are widened. Possible compiler bug? + const uint32_t __attempt = + ((static_cast(__op(static_cast<_Type>(__old >> __offset))) << __offset) & __valueMask) + | (__old & __windowMask); + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return static_cast<_Type>(__old >> __offset); + } + } +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. 
+template = 0> +_CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + _Type __expected = 0; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __ptr, __expected, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + _Type __desired = __op(__expected); + while (!__cuda_atomic_compare_exchange(__ptr, __expected, __expected, __desired, _Order{}, _Operand{}, _Sco{})) + { + __desired = __op(__expected); + } + return __expected; +} + +template = 0> +static inline _CCCL_DEVICE void +__cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_cuda_mmio_disable) +{ + // Store requires cas on 8/16b types + __cuda_atomic_fetch_update( + __ptr, + [__val](_Type __old) { + return __val; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old + __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old & __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old ^ __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old | __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __old < __op ? __old : __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __op < __old ? 
__old : __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + +template = 0> +static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco) +{ + __dst = __cuda_atomic_fetch_update( + __ptr, + [__op](_Type __old) { + return __op; + }, + _Order{}, + __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{}, + _Sco{}); +} + template _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco) { @@ -98,7 +340,7 @@ _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __ } template = 0> -_CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -109,7 +351,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder _Sco{}); } template = 0> -_CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -121,7 +363,7 @@ _CCCL_DEVICE float __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int _ } template = 0> -_CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -132,7 +374,7 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorde _Sco{}); } template = 0> -_CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) +_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco) { return __atomic_fetch_update_cuda( __ptr, @@ -143,262 +385,6 @@ _CCCL_DEVICE double __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int _Sco{}); } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// void volatile* __ptr, -// _Tp* __expected, -// const _Tp __desired, -// bool __weak, -// int __success_memorder, -// int __failure_memorder, -// _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__old, __expected, sizeof(__proxy_t)); -// memcpy(&__new, &__desired, sizeof(__proxy_t)); -// bool __result = -// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); -// memcpy(__expected, &__old, sizeof(__proxy_t)); -// return __result; -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__old, __expected, sizeof(__proxy_t)); -// memcpy(&__new, &__desired, sizeof(__proxy_t)); -// bool __result = -// __atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{}); -// memcpy(__expected, &__old, sizeof(__proxy_t)); -// return __result; -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* 
__val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } -// template ::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// using __proxy_t = _If; -// __proxy_t __old = 0; -// __proxy_t __new = 0; -// memcpy(&__new, __val, sizeof(__proxy_t)); -// __atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{}); -// memcpy(__ret, &__old, sizeof(__proxy_t)); -// } - -// template = 0> -// _CCCL_DEVICE bool __atomic_compare_exchange_cuda( -// _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); -// auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; -// auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset; - -// uint32_t __old = *__expected << __offset; -// uint32_t __old_value; -// while (1) -// { -// __old_value = (__old & __mask) >> __offset; -// if (__old_value != *__expected) -// { -// break; -// } -// uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); -// if (__atomic_compare_exchange_cuda( -// __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{})) -// { -// return true; -// } -// } -// *__expected = __old_value; -// return false; -// } - -// template = 0> -// _CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) -// ; -// *__ret = __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected + __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected + __val; -// } -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected > __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected > __val ? 
__expected : __val; -// } - -// return __expected; -// } - -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp * __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } -// template ::value, int> = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected < __val ? __expected : __val; - -// while (__desired == __val -// && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected < __val ? __expected : __val; -// } - -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected - __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected - __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected & __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected & __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected ^ __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected ^ __val; -// } -// return __expected; -// } - -// template = 0> -// _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) -// { -// _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); -// _Tp __desired = __expected | __val; -// while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) -// { -// __desired = __expected | __val; -// } -// return __expected; -// } - -// template -// _CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( -// _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, -// _Sco) -// { -// return __atomic_compare_exchange_cuda( -// __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); -// } - -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{}); -// return __ret; -// } -// template -// _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco) -// { -// _Tp __ret; -// __atomic_exchange_cuda(__ptr, __ret, __val, 
__memorder, _Sco{}); -// return __ret; -// } - _CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int) { asm volatile("" ::: "memory"); diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index e72144b68c..01a0f2e3a5 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -1220,86 +1220,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_swap_memory_order_dispatch return __res; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acquire.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acquire.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acquire.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acquire.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.relaxed.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.relaxed.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.relaxed.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.relaxed.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type 
__op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.release.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.release.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.release.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.release.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.acq_rel.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.acq_rel.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.acq_rel.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.acq_rel.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.cas.cta.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.cas.cluster.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, 
__atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.cas.gpu.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } -template -static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( - _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.cas.sys.b16 %0,[%1],%2,%3;" : "=h"(__dst) : "l"(__ptr), "h"(__cmp), "h"(__op) : "memory"); return __dst == __cmp; } template static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) @@ -1763,86 +1683,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange_memory_order_dispatch(_Fn ) } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acquire.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acquire.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acquire.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acquire.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.relaxed.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.relaxed.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.relaxed.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.relaxed.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.release.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } 
-template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.release.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.release.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.release.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.acq_rel.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.acq_rel.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.acq_rel.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.acq_rel.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_block_tag) -{ asm volatile("atom.exch.cta.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_cluster_tag) -{ asm volatile("atom.exch.cluster.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_device_tag) -{ asm volatile("atom.exch.gpu.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } -template -static inline _CCCL_DEVICE void __cuda_atomic_exchange( - _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b16, __thread_scope_system_tag) -{ asm volatile("atom.exch.sys.b16 %0,[%1],%2;" : "=h"(__old) : "l"(__ptr), "h"(__new) : "memory"); } template static inline _CCCL_DEVICE void __cuda_atomic_exchange( _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b32, __thread_scope_block_tag) diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h 
b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index 861e9f7b08..9a4c2d5bfc 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -69,8 +69,15 @@ enum class __atomic_cuda_operand template <__atomic_cuda_operand _Op, size_t _Size> struct __atomic_cuda_operand_tag -{}; +{ + static constexpr auto __op = _Op; + static constexpr auto __size = _Size; +}; +using __atomic_cuda_operand_f8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 8>; +using __atomic_cuda_operand_s8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 8>; +using __atomic_cuda_operand_u8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 8>; +using __atomic_cuda_operand_b8 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, 8>; using __atomic_cuda_operand_f16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_f, 16>; using __atomic_cuda_operand_s16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_s, 16>; using __atomic_cuda_operand_u16 = __atomic_cuda_operand_tag<__atomic_cuda_operand::_u, 16>; @@ -103,13 +110,15 @@ struct __atomic_longlong2 template using __atomic_cuda_deduce_bitwise = - _If, - _If, - _If, - __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>; + _If, + _If, + _If, + _If, + __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>>; template using __atomic_cuda_deduce_arithmetic = @@ -118,37 +127,56 @@ using __atomic_cuda_deduce_arithmetic = __atomic_cuda_operand_deduction, __atomic_cuda_operand_deduction>, _If<_CCCL_TRAIT(is_signed, _Type), - _If, - __atomic_cuda_operand_deduction>, // There is no atom.add.s64 - _If, - __atomic_cuda_operand_deduction>>>; + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>, // There is no + // atom.add.s64 + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>>>; template using __atomic_cuda_deduce_minmax = - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_floating_point, _Type), _If, - __atomic_cuda_operand_deduction>>; + __atomic_cuda_operand_deduction, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>, // atom.min|max.s64 + // supported + _If, + _If, + _If, + __atomic_cuda_operand_deduction>>>>>; template using __atomic_enable_if_native_bitwise = bool; template -using __atomic_enable_if_native_arithmetic = typename enable_if<_CCCL_TRAIT(is_scalar, _Type), bool>::type; - -template -using __atomic_enable_if_not_native_arithmetic = typename enable_if::type; +using __atomic_enable_if_native_arithmetic = __enable_if_t<_CCCL_TRAIT(is_scalar, _Type), bool>; template -using __atomic_enable_if_native_minmax = typename enable_if<_CCCL_TRAIT(is_integral, _Type), bool>::type; +using __atomic_enable_if_native_minmax = __enable_if_t<_CCCL_TRAIT(is_integral, _Type), bool>; template -using __atomic_enable_if_not_native_minmax = typename enable_if::type; +using __atomic_enable_if_not_native_minmax = __enable_if_t; _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index 8afa9756ef..33d11fd9ad 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -72,9 +72,9 @@ static inline void __atomic_thread_fence(int __memorder) 
} template -using _enable_if_sized_as = typename enable_if::type; +using __enable_if_sized_as = __enable_if_t; -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -84,7 +84,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -94,7 +94,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -104,7 +104,7 @@ void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) # endif *__ret = reinterpret_cast<_Type&>(__tmp); } -template = 0> +template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { # ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN @@ -136,7 +136,7 @@ void __atomic_load(const volatile _Type* __ptr, _Type* __ret, int __memorder) } } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int8*>(__val); @@ -147,7 +147,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store8(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int16*>(__val); @@ -158,7 +158,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store16(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int32*>(__val); @@ -170,7 +170,7 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) __iso_volatile_store32(__d, *__t); # endif } -template = 0> +template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int64*>(__val); @@ -202,7 +202,7 @@ void __atomic_store(volatile _Type* __ptr, _Type* __val, int __memorder) } } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -215,7 +215,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -228,7 +228,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -241,7 +241,7 @@ bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __exp *__expected = reinterpret_cast(__old); return false; } -template = 0> +template = 0> bool __atomic_compare_exchange_relaxed(const volatile _Type* __ptr, _Type* __expected, const _Type* __desired) { auto __tmp_desired = reinterpret_cast(*__desired); @@ -287,25 +287,25 @@ bool 
__atomic_compare_exchange( return success; } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_exchange_relaxed(const volatile _Type* __ptr, const _Type* __val, _Type* __ret) { auto const __old = _InterlockedExchange64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -341,25 +341,25 @@ void __atomic_exchange(_Type volatile* __ptr, const _Type* __val, _Type* __ret, } } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_add_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedExchangeAdd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -404,25 +404,25 @@ _Type __atomic_fetch_sub(_Type volatile* __ptr, _Delta __val, int __memorder) return __atomic_fetch_add(__ptr, 0 - __val, __memorder); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_and_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedAnd64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -462,25 +462,25 @@ _Type __atomic_fetch_and(_Type volatile* __ptr, _Delta __val, int __memorder) return 
*__dest; } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_xor_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedXor64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); @@ -520,25 +520,25 @@ _Type __atomic_fetch_xor(_Type volatile* __ptr, _Delta __val, int __memorder) return *__dest; } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr8((volatile char*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr16((volatile short*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr((volatile long*) __ptr, reinterpret_cast(*__val)); *__ret = reinterpret_cast<_Type const&>(__old); } -template = 0> +template = 0> void __atomic_fetch_or_relaxed(const volatile _Type* __ptr, const _Delta* __val, _Type* __ret) { auto const __old = _InterlockedOr64((volatile __int64*) __ptr, reinterpret_cast<__int64 const&>(*__val)); diff --git a/libcudacxx/include/cuda/std/__atomic/types.h b/libcudacxx/include/cuda/std/__atomic/types.h index 4b58ba4901..73e5ef0044 100644 --- a/libcudacxx/include/cuda/std/__atomic/types.h +++ b/libcudacxx/include/cuda/std/__atomic/types.h @@ -32,10 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct __atomic_traits { - static constexpr bool __atomic_requires_lock = !__atomic_is_always_lock_free<_Tp>::__value; - static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4; - static constexpr bool __atomic_supports_reference = - __atomic_is_always_lock_free<_Tp>::__value && (sizeof(_Tp) >= 4 && sizeof(_Tp) <= 8); + static constexpr bool __atomic_requires_lock = !__atomic_is_always_lock_free<_Tp>::__value; + static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4; + static constexpr bool __atomic_supports_reference = __atomic_is_always_lock_free<_Tp>::__value && sizeof(_Tp) <= 8; }; template diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index b20a0f3fed..0b0f18a7c5 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -41,9 +41,4 @@ if (LIBCUDACXX_TEST_WITH_NVRTC) add_subdirectory(utils/nvidia/nvrtc) endif() -find_program(filecheck "FileCheck") - -if (filecheck) - message("-- ${filecheck} found... 
building atomic codegen tests") - add_subdirectory(atomic_codegen) -endif() +add_subdirectory(atomic_codegen) diff --git a/libcudacxx/test/atomic_codegen/CMakeLists.txt b/libcudacxx/test/atomic_codegen/CMakeLists.txt index 095fa41cf7..d881e823f8 100644 --- a/libcudacxx/test/atomic_codegen/CMakeLists.txt +++ b/libcudacxx/test/atomic_codegen/CMakeLists.txt @@ -1,6 +1,13 @@ add_custom_target(libcudacxx.test.atomics.ptx) -find_program(filecheck "FileCheck" REQUIRED) +find_program(filecheck "FileCheck") + +if (filecheck) + message("-- ${filecheck} found... building atomic codegen tests") +else() + return() +endif() + find_program(cuobjdump "cuobjdump" REQUIRED) find_program(bash "bash" REQUIRED) @@ -16,6 +23,14 @@ foreach(test_path IN LISTS libcudacxx_atomic_codegen_tests) STATIC ${test_path} ) + set_target_properties( + atomic_codegen_${test_name} + PROPERTIES + CUDA_ARCHITECTURES "80" + ) + + target_compile_options(atomic_codegen_${test_name} PRIVATE "-Wno-comment") + ## Important for testing the local headers target_include_directories(atomic_codegen_${test_name} PRIVATE "${libcudacxx_SOURCE_DIR}/include") add_dependencies(libcudacxx.test.atomics.ptx atomic_codegen_${test_name}) diff --git a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu index d97636d647..9d1317e021 100644 --- a/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_add_non_volatile.cu @@ -10,13 +10,11 @@ __global__ void add_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*add_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu index e0b6c3d151..99c73f4c32 100644 --- a/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_cas_non_volatile.cu @@ -6,18 +6,17 @@ __global__ void cas_device_relaxed_non_volatile(int* data, int* out, int n) ref.compare_exchange_strong(*out, n, cuda::std::memory_order_relaxed); } +// clang-format off /* ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*cas_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], 
%rd[[#EXPECTED]]; ; SM8X-NEXT: ld.global.u32 %r[[#LOCALEXP:]], [%rd[[#INPUT]]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.cas.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#LOCALEXP]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu index 787fcd30bb..59c4d52d97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_exch_non_volatile.cu @@ -10,13 +10,11 @@ __global__ void exch_device_relaxed_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*exch_device_relaxed_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.exch.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu index 6bef7f1c2a..804f953294 100644 --- a/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_load_non_volatile.cu @@ -10,12 +10,10 @@ __global__ void load_relaxed_device_non_volatile(int* data, int* out) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*load_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], [[[FUNCTION]]_param_1]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#EXPECTED:]], {{.*}}[[FUNCTION]]_param_1{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#EXPECTED]]; -; SM8X-NEXT: // -; SM8X-NEXT: ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}ld.relaxed.gpu.b32 %r[[#DEST:]],[%rd[[#ATOM]]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu index 983c8e9fac..4dcae5d176 100644 --- a/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_store_non_volatile.cu @@ -10,11 +10,9 @@ __global__ void store_relaxed_device_non_volatile(int* data, int in) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*store_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_1]; -; SM8X-NEXT: // -; SM8X-NEXT: st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]]; -; SM8X-NEXT: // +; 
SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-NEXT: {{/*[[:space:]] *}}st.relaxed.gpu.b32 [%rd[[#ATOM]]],%r[[#INPUT]];{{[[:space:]]/*}} ; SM8X-NEXT: ret; */ diff --git a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu index 9d1ffaefa1..b8a40fee97 100644 --- a/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu +++ b/libcudacxx/test/atomic_codegen/atomic_sub_non_volatile.cu @@ -10,14 +10,12 @@ __global__ void sub_relaxed_device_non_volatile(int* data, int* out, int n) ; SM8X-LABEL: .target sm_80 ; SM8X: .visible .entry [[FUNCTION:_.*sub_relaxed_device_non_volatile.*]]( -; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], [[[FUNCTION]]_param_0]; -; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], [[[FUNCTION]]_param_1]; -; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], [[[FUNCTION]]_param_2]; +; SM8X-DAG: ld.param.u64 %rd[[#ATOM:]], {{.*}}[[FUNCTION]]_param_0{{.*}} +; SM8X-DAG: ld.param.u64 %rd[[#RESULT:]], {{.*}}[[FUNCTION]]_param_1{{.*}} +; SM8X-DAG: ld.param.u32 %r[[#INPUT:]], {{.*}}[[FUNCTION]]_param_2{{.*}} ; SM8X-NEXT: cvta.to.global.u64 %rd[[#GOUT:]], %rd[[#RESULT]]; ; SM8X-NEXT: neg.s32 %r[[#NEG:]], %r[[#INPUT]]; -; SM8X-NEXT: // -; SM8X-NEXT: atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]]; -; SM8X-NEXT: // +; SM8X-NEXT: {{/*[[:space:]] *}}atom.add.relaxed.gpu.s32 %r[[#DEST:]],[%rd[[#ATOM]]],%r[[#NEG]];{{[[:space:]]/*}} ; SM8X-NEXT: st.global.u32 [%rd[[#GOUT]]], %r[[#DEST]]; ; SM8X-NEXT: ret; diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash old mode 100644 new mode 100755 index a5c62e91bd..c66bb43363 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -6,4 +6,4 @@ input_archive=$1 input_testfile=$2 input_prefix=$3 -cuobjdump --dump-ptx $input_archive | FileCheck --check-prefix $input_prefix $input_testfile +cuobjdump --dump-ptx $input_archive | FileCheck --match-full-lines --check-prefix $input_prefix $input_testfile diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp new file mode 100644 index 0000000000..ab86ca5222 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic_ref_small.pass.cpp @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 +// UNSUPPORTED: windows && pre-sm-70 + +// + +#include +#include +#include + +#include "test_macros.h" + +/* +Test goals: +Interleaved 8b/16b access to a 32b window while there is thread contention. + +for 8b: +Launch 1024 threads, fetch_add(1) each window, value at end of kernel should be 0xFF..FF. This checks for corruption +caused by interleaved access to different parts of the window. + +for 16b: +Launch 1024 threads, fetch_add(1), checking for 0x01FF01FF. 
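+(The value asserted in main() for the 16-bit case is 0x02000200: the 1024 threads are split across the two 16-bit halves of the 32-bit window, so each half receives 512 = 0x0200 increments; 0x01FF is the largest individual value returned by fetch_add.)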
+*/ + +template +__host__ __device__ void fetch_add_into_window(T* window, uint16_t* atomHistory) +{ + typedef cuda::atomic_ref Atom; + + Atom a(*window); + *atomHistory = a.fetch_add(Inc); +} + +template +__device__ void device_do_test(uint32_t expected) +{ + constexpr uint32_t threadCount = 1024; + constexpr uint32_t histogramResultCount = 256 * sizeof(T); + constexpr uint32_t histogramEntriesPerThread = 4 / sizeof(T); + + __shared__ uint16_t atomHistory[threadCount]; + __shared__ uint8_t atomHistogram[histogramResultCount]; + __shared__ uint32_t atomicStorage; + + cuda::atomic_ref bucket(atomicStorage); + + constexpr uint32_t offsetMask = ((4 / sizeof(T)) - 1); + // Access offset is interleaved meaning threads 4, 5, 6, 7 access window 0, 1, 2, 3 and so on. + const uint32_t threadOffset = threadIdx.x & offsetMask; + + if (threadIdx.x == 0) + { + memset(atomHistogram, 0, histogramResultCount); + bucket.store(0); + } + __syncthreads(); + + T* window = reinterpret_cast(&atomicStorage) + threadOffset; + fetch_add_into_window(window, atomHistory + threadIdx.x); + + __syncthreads(); + if (threadIdx.x == 0) + { + // For each thread, add its atomic result into the corresponding bucket + for (uint32_t i = 0; i < threadCount; i++) + { + atomHistogram[atomHistory[i]]++; + } + // Check that each bucket has exactly (4 / sizeof(T)) entries + // This checks that atomic fetch operations were sequential. i.e. 4xfetch_add(1) returns [0, 1, 2, 3] + for (uint32_t i = 0; i < histogramResultCount; i++) + { + assert(atomHistogram[i] == histogramEntriesPerThread); + } + printf("expected: 0x%X\r\n", expected); + printf("result: 0x%X\r\n", bucket.load()); + assert(bucket.load() == expected); + } +} + +int main(int, char**) +{ + NV_DISPATCH_TARGET(NV_IS_HOST, + (cuda_thread_count = 1024;), + NV_IS_DEVICE, + (device_do_test(0); device_do_test(0x02000200);)); + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp index ddef76ec28..4cae377b27 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_cuda.pass.cpp @@ -175,19 +175,14 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); diff --git a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp index 9da187cee7..366b93b82f 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp +++ b/libcudacxx/test/libcudacxx/heterogeneous/atomic/reference_std.pass.cpp @@ -167,19 +167,14 @@ using bitwise_atomic_testers = void kernel_invoker() { -// todo -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); -#ifdef _LIBCUDACXX_ATOMIC_REF_SUPPORTS_SMALL_INTEGRAL validate_pinned(); validate_pinned(); -#endif validate_pinned(); validate_pinned(); validate_pinned(); diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp 
b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp index 56153f3664..5ef493b6dd 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>(); diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp index b237c862a5..57c4a10f53 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp @@ -177,6 +177,10 @@ __host__ __device__ void test_for_all_types() #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS test, char32_t, Selector>(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS + test, int8_t, Selector>(); + test, uint8_t, Selector>(); + test, int16_t, Selector>(); + test, uint16_t, Selector>(); test, int32_t, Selector>(); test, uint32_t, Selector>(); test, int64_t, Selector>();
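
Illustrative usage sketch, not part of the patch above: with the 8/16-bit paths routed through __cuda_atomic_fetch_update, device code can apply cuda::atomic_ref to packed sub-word integers. The kernel below is a hypothetical example (the name bump_bytes and the launch shape are invented for illustration); it assumes an sm_70-or-newer target and the <cuda/atomic> header, matching the NV_PROVIDES_SM_70 dispatch used in cuda_ptx_derived.h.

#include <cuda/atomic>
#include <cstdint>

__global__ void bump_bytes(uint8_t* counters)
{
  // Four 8-bit counters share one aligned 32-bit word; each fetch_add is
  // emulated with a 32-bit CAS loop on the containing window, as implemented
  // by the non-native __cuda_atomic_fetch_update overload in this patch.
  cuda::atomic_ref<uint8_t, cuda::thread_scope_device> c(counters[threadIdx.x & 3]);
  c.fetch_add(1, cuda::std::memory_order_relaxed);
}

int main()
{
  uint8_t* counters = nullptr;
  cudaMalloc(&counters, 4);
  cudaMemset(counters, 0, 4);
  bump_bytes<<<1, 256>>>(counters); // each of the 4 byte counters ends at 64
  cudaDeviceSynchronize();
  cudaFree(counters);
  return 0;
}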