Enable including atomics and friends in TUs that do not support them. #1736

Open · wants to merge 4 commits into base: main
62 changes: 47 additions & 15 deletions libcudacxx/include/cuda/std/__atomic/types/base.h
@@ -64,11 +64,17 @@ struct __atomic_storage
}
};

#if defined(_CCCL_CUDA_COMPILER)
extern "C" _CCCL_DEVICE void __atomic_is_not_supported_pre_sm_60();
#endif
Review comment (Collaborator, resolved):
Suggested change:
- #endif
+ #endif // _CCCL_CUDA_COMPILER
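For context, the declaration added above names a device function that is intentionally never defined: the header now compiles in any TU, and an unresolved-symbol error surfaces at device-link time only if a pre-sm_60 code path actually reaches an atomic. A minimal sketch of the idea, under assumed semantics (the fence_system example is hypothetical, not part of the PR):

#include <nv/target> // NV_DISPATCH_TARGET, NV_PROVIDES_SM_60, NV_IS_DEVICE

// Declared but never defined: a surviving call becomes an unresolved symbol
// at device-link time rather than an error at header-inclusion time.
extern "C" __device__ void __atomic_is_not_supported_pre_sm_60();

__device__ void fence_system()
{
  NV_DISPATCH_TARGET(
    NV_PROVIDES_SM_60, (__threadfence_system();),                 // real path
    NV_IS_DEVICE,      (__atomic_is_not_supported_pre_sm_60();)); // trap path
}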


_CCCL_HOST_DEVICE inline void __atomic_thread_fence_dispatch(memory_order __order)
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(__atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag());),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(__atomic_thread_fence_host(__order);))
}
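Note that branch order is load-bearing in these dispatch functions: NV_DISPATCH_TARGET (from <nv/target>) selects the first condition that matches, and NV_IS_DEVICE holds for every device compilation pass, including sm_60 and newer, so NV_PROVIDES_SM_60 must come first. A standalone sketch of that first-match behavior (the where() function is hypothetical, not from the PR):

#include <nv/target>

__host__ __device__ inline const char* where()
{
  NV_DISPATCH_TARGET(
    NV_PROVIDES_SM_60, (return "device, sm_60 or newer";),
    NV_IS_DEVICE,      (return "device, pre-sm_60";), // matched only below sm_60
    NV_IS_HOST,        (return "host";));
}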
@@ -91,8 +97,10 @@ template <typename _Sto, typename _Up, typename _Sco, __atomic_storage_is_base<_
_CCCL_HOST_DEVICE inline void __atomic_store_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {})
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(__atomic_store_n_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(__atomic_store_host(__a->get(), __val, __order);))
}
@@ -102,8 +110,10 @@ _CCCL_HOST_DEVICE inline auto
__atomic_load_dispatch(const _Sto* __a, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(return __atomic_load_n_cuda(__a->get(), static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
Review comment (Collaborator):
We might need to make this a template that returns __atomic_underlying_t<_Sto>, because otherwise we might have no return in this function.

Review comment (Contributor):
Or:
Suggested change:
- (__atomic_is_not_supported_pre_sm_60();),
+ (__atomic_is_not_supported_pre_sm_60(); return {};),

Review comment (Contributor):
This comment applies to a lot more places below.

Review comment (Collaborator):
I think we should just mark the undefined function [[noinline]]; that should kill any potential for spurious warnings.

NV_IS_HOST,
(return __atomic_load_host(__a->get(), __order);))
}
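A sketch of what the reviewers' first two suggestions could look like (assumed shapes, not the merged code; the template variant would have to drop extern "C", since language linkage does not apply to templates):

// Variant 1: a trap template carrying the dispatch function's return type, so
// (return __atomic_is_not_supported_pre_sm_60<__atomic_underlying_t<_Sto>>();)
// type-checks in every value-returning dispatch function.
template <class _Tp>
_CCCL_DEVICE _Tp __atomic_is_not_supported_pre_sm_60(); // still never defined

// Variant 2: keep the void trap and value-initialize a dummy result:
//   NV_IS_DEVICE,
//   (__atomic_is_not_supported_pre_sm_60(); return {};),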
@@ -113,8 +123,10 @@ _CCCL_HOST_DEVICE inline auto
__atomic_exchange_dispatch(_Sto* __a, _Up __value, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(return __atomic_exchange_n_cuda(__a->get(), __value, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_exchange_host(__a->get(), __value, __order);))
}
@@ -125,7 +137,7 @@ _CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(
{
bool __result = false;
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(__result = __atomic_compare_exchange_cuda(
__a->get(),
__expected,
@@ -134,6 +146,8 @@ _CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(
static_cast<__memory_order_underlying_t>(__success),
static_cast<__memory_order_underlying_t>(__failure),
_Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(__result = __atomic_compare_exchange_strong_host(__a->get(), __expected, __val, __success, __failure);))
return __result;
@@ -145,7 +159,7 @@ _CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(
{
bool __result = false;
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(__result = __atomic_compare_exchange_cuda(
__a->get(),
__expected,
@@ -154,6 +168,8 @@ _CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(
static_cast<__memory_order_underlying_t>(__success),
static_cast<__memory_order_underlying_t>(__failure),
_Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(__result = __atomic_compare_exchange_weak_host(__a->get(), __expected, __val, __success, __failure);))
return __result;
@@ -164,8 +180,10 @@ _CCCL_HOST_DEVICE inline auto
__atomic_fetch_add_dispatch(_Sto* __a, _Up __delta, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(return __atomic_fetch_add_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_fetch_add_host(__a->get(), __delta, __order);))
}
@@ -175,8 +193,10 @@ _CCCL_HOST_DEVICE inline auto
__atomic_fetch_sub_dispatch(_Sto* __a, _Up __delta, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(return __atomic_fetch_sub_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_fetch_sub_host(__a->get(), __delta, __order);))
}
@@ -186,8 +206,10 @@ _CCCL_HOST_DEVICE inline auto
__atomic_fetch_and_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(return __atomic_fetch_and_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_fetch_and_host(__a->get(), __pattern, __order);))
}
@@ -197,8 +219,10 @@ _CCCL_HOST_DEVICE inline auto
__atomic_fetch_or_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(return __atomic_fetch_or_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_fetch_or_host(__a->get(), __pattern, __order);))
}
@@ -208,8 +232,10 @@ _CCCL_HOST_DEVICE inline auto
__atomic_fetch_xor_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE,
NV_PROVIDES_SM_60,
(return __atomic_fetch_xor_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_fetch_xor_host(__a->get(), __pattern, __order);))
}
@@ -218,19 +244,25 @@ template <typename _Sto, typename _Up, typename _Sco, __atomic_storage_is_base<_
_CCCL_HOST_DEVICE inline auto
__atomic_fetch_max_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_IF_TARGET(
NV_IS_DEVICE,
NV_DISPATCH_TARGET(
NV_PROVIDES_SM_60,
(return __atomic_fetch_max_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_fetch_max_host(__a->get(), __val, __order);))
}

template <typename _Sto, typename _Up, typename _Sco, __atomic_storage_is_base<_Sto> = 0>
_CCCL_HOST_DEVICE inline auto
__atomic_fetch_min_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto>
{
NV_IF_TARGET(
NV_IS_DEVICE,
NV_DISPATCH_TARGET(
NV_PROVIDES_SM_60,
(return __atomic_fetch_min_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});),
NV_IS_DEVICE,
(__atomic_is_not_supported_pre_sm_60();),
NV_IS_HOST,
(return __atomic_fetch_min_host(__a->get(), __val, __order);))
}

8 changes: 2 additions & 6 deletions libcudacxx/include/cuda/std/__cuda/barrier.h
@@ -13,10 +13,6 @@

#include <cuda/std/detail/__config>

#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700
# error "CUDA synchronization primitives are only supported for sm_70 and up."
#endif

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -218,7 +214,7 @@ class barrier<thread_scope_block, _CUDA_VSTD::__empty_completion> : public __blo
// Need 2 instructions, can't finish barrier with arrive > 1
if (__update > 1) { _CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1); } __token =
_CUDA_VPTX::mbarrier_arrive(__bh);),
NV_IS_DEVICE,
NV_PROVIDES_SM_70,
(
if (!__isShared(&__barrier)) { return __barrier.arrive(__update); }

@@ -237,7 +233,7 @@ class barrier<thread_scope_block, _CUDA_VSTD::__empty_completion> : public __blo
if (__leader == static_cast<int>(__laneid)) {
__token = __barrier.arrive(__inc);
} __token = __shfl_sync(__active, __token, __leader);),
NV_IS_HOST,
NV_ANY_TARGET,
(__token = __barrier.arrive(__update);))
return __token;
}
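With NV_PROVIDES_SM_70 and NV_ANY_TARGET replacing NV_IS_DEVICE and NV_IS_HOST, pre-sm_70 device code now falls through to the generic arrive path instead of tripping the removed #error. The resulting branch structure, roughly (a sketch assuming the first condition, which sits above this hunk, is the sm_90 mbarrier path):

NV_DISPATCH_TARGET(
  NV_PROVIDES_SM_90, (/* PTX mbarrier fast path */),
  NV_PROVIDES_SM_70, (/* warp-aggregated __shfl_sync path */),
  NV_ANY_TARGET,     (/* host and pre-sm_70 device: __token = __barrier.arrive(__update); */));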
4 changes: 0 additions & 4 deletions libcudacxx/include/cuda/std/barrier
@@ -11,10 +11,6 @@
#ifndef _CUDA_STD_BARRIER
#define _CUDA_STD_BARRIER

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
# error "CUDA synchronization primitives are only supported for sm_70 and up."
#endif

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
4 changes: 0 additions & 4 deletions libcudacxx/include/cuda/std/latch
@@ -11,10 +11,6 @@
#ifndef _CUDA_STD_LATCH
#define _CUDA_STD_LATCH

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
# error "CUDA synchronization primitives are only supported for sm_70 and up."
#endif

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
4 changes: 0 additions & 4 deletions libcudacxx/include/cuda/std/semaphore
@@ -11,10 +11,6 @@
#ifndef _CUDA_STD_SEMAPHORE
#define _CUDA_STD_SEMAPHORE

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
# error "CUDA synchronization primitives are only supported for sm_70 and up."
#endif

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
2 changes: 1 addition & 1 deletion libcudacxx/test/public_headers/CMakeLists.txt
@@ -48,7 +48,7 @@ function(libcudacxx_add_public_header_test header)
-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)

# Ensure that if this is an atomic header, we only include the right architectures
Review comment (Collaborator):
This comment is stale with these changes.
string(REGEX MATCH "atomic|barrier|latch|semaphore|annotated_ptr|pipeline" match "${header}")
string(REGEX MATCH "annotated_ptr|pipeline" match "${header}")
if(match)
# Ensure that we only compile the header when we have some architectures enabled
if (NOT architectures_at_least_sm70)