Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cuda::device::barrier_expect_tx #498

Merged
merged 6 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 19 additions & 10 deletions libcudacxx/.upstream-tests/test/cuda/barrier/arrive_tx.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,36 +46,45 @@ void mbarrier_complete_tx(
);
}

template<typename Barrier>
template<bool split_arrive_and_expect>
__device__
void thread(Barrier& b, int arrives_per_thread)
void thread(cuda::barrier<cuda::thread_scope_block>& b, int arrives_per_thread)
{
constexpr int tx_count = 1;
auto tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count);
typename cuda::barrier<cuda::thread_scope_block>::arrival_token tok;

if _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 (split_arrive_and_expect) {
cuda::device::barrier_expect_tx(b, tx_count);
tok = b.arrive(arrives_per_thread);
} else{
tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count);
}

// Manually increase the transaction count of the barrier.
mbarrier_complete_tx(b, tx_count);

b.wait(cuda::std::move(tok));
}

template<bool split_arrive_and_expect>
__device__
void test()
{
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
// Run all threads, each arriving with arrival count 1
constexpr auto block = cuda::thread_scope_block;
using barrier_t = cuda::barrier<cuda::thread_scope_block>;

__shared__ cuda::barrier<block> bar_1;
init(&bar_1, (int) blockDim.x);
shared_memory_selector<barrier_t, constructor_initializer> sel_1;
barrier_t* bar_1 = sel_1.construct(blockDim.x);
__syncthreads();
thread(bar_1, 1);
thread<split_arrive_and_expect>(*bar_1, 1);

// Run all threads, each arriving with arrival count 2
__shared__ cuda::barrier<block> bar_2;
init(&bar_2, (int) 2 * blockDim.x);
shared_memory_selector<barrier_t, constructor_initializer> sel_2;
barrier_t* bar_2 = sel_2.construct(2 * blockDim.x);
__syncthreads();
thread(bar_2, 2);
thread<split_arrive_and_expect>(*bar_2, 2);
)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ int main(int, char**)
cuda_thread_count = 256;
),
NV_IS_DEVICE, (
test();
constexpr bool split_arrive_and_expect = false;
test<split_arrive_and_expect>();
)
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ int main(int, char**)
cuda_thread_count = 2;
),
NV_IS_DEVICE, (
test();
constexpr bool split_arrive_and_expect = false;
test<split_arrive_and_expect>();
)
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ int main(int, char**)
cuda_thread_count = 32;
),
NV_IS_DEVICE, (
test();
constexpr bool split_arrive_and_expect = false;
test<split_arrive_and_expect>();
)
);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "arrive_tx.h"

int main(int, char**)
{
NV_DISPATCH_TARGET(
NV_IS_HOST, (
// Required by concurrent_agents_launch to know how many we're
// launching. This can only be an int, because the nvrtc tests use grep
// to figure out how many threads to launch.
cuda_thread_count = 256;
),
NV_IS_DEVICE, (
constexpr bool split_arrive_and_expect = true;
test<split_arrive_and_expect>();
)
);

return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include <cuda/barrier>
#include "test_macros.h"

// Suppress warning about barrier in shared memory
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

__device__ uint64_t bar_storage;

int main(int, char**){
NV_IF_TARGET(
NV_IS_DEVICE, (
cuda::barrier<cuda::thread_scope_block> *bar_ptr;
bar_ptr = reinterpret_cast<cuda::barrier<cuda::thread_scope_block> *>(bar_storage);

if (threadIdx.x == 0) {
init(bar_ptr, blockDim.x);
}
__syncthreads();

// Should fail because the barrier is in device memory.
cuda::device::barrier_expect_tx(*bar_ptr, 1);
));
return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "arrive_tx.h"

int main(int, char**)
{
NV_DISPATCH_TARGET(
NV_IS_HOST, (
// Required by concurrent_agents_launch to know how many we're
// launching. This can only be an int, because the nvrtc tests use grep
// to figure out how many threads to launch.
cuda_thread_count = 2;
),
NV_IS_DEVICE, (
constexpr bool split_arrive_and_expect = true;
test<split_arrive_and_expect>();
)
);

return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "arrive_tx.h"

int main(int, char**)
{
NV_DISPATCH_TARGET(
NV_IS_HOST, (
// Required by concurrent_agents_launch to know how many we're
// launching. This can only be an int, because the nvrtc tests use grep
// to figure out how many threads to launch.
cuda_thread_count = 32;
),
NV_IS_DEVICE, (
constexpr bool split_arrive_and_expect = true;
miscco marked this conversation as resolved.
Show resolved Hide resolved
test<split_arrive_and_expect>();
)
);

return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ If `!(scope == thread_block_scope && __isShared(this))`, then the semantics are
the same as [`cuda::std::barrier`]; otherwise, see below.
The `cuda::barrier` class templates extends `cuda::std::barrier` with the following additional operations:

| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` |
| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)` |
| [`cuda::barrier::init`] | Initialize a `cuda::barrier`. `(friend function)` |
| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)` |
| [`cuda::device::barrier_arrive_tx`] | Arrive on a `cuda::barrier<cuda::thread_scope_block>` with transaction count update. `(function)` |

| [`cuda::device::barrier_expect_tx`] | Update transaction count of `cuda::barrier<cuda::thread_scope_block>`. `(function)` |

If `scope == thread_scope_block && __isShared(this)`, then the
semantics of [[thread.barrier.class]](http://eel.is/c++draft/thread.barrier.class) of ISO/IEC
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
---
grand_parent: Extended API
parent: Barriers
---

# `cuda::device::barrier_expect_tx`

Defined in header `<cuda/barrier>`:

```cuda
__device__
void cuda::device::barrier_expect_tx(
cuda::barrier<cuda::thread_scope_block>& bar,
ptrdiff_t transaction_count_update);
```

Updates the expected transaction count of a barrier in shared memory.

## Preconditions

* `__isShared(&bar) == true`
* `0 <= transaction_count_update && transaction_count_update <= (1 << 20) - 1`

## Effects

* This function increments the expected transaction count by `transaction_count_update`.
* This function executes atomically.

## Notes

This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
higher.

## Example

```cuda
#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move

#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_expect_tx is not available.");
#endif // __CUDA_MINIMUM_ARCH__

__device__ alignas(16) int gmem_x[2048];

__global__ void example_kernel() {
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
__shared__ alignas(16) int smem_x[1024];
__shared__ barrier_t bar;

if (threadIdx.x == 0) {
init(&bar, blockDim.x);
}
__syncthreads();

if (threadIdx.x == 0) {
cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
cuda::device::barrier_expect_tx(bar, sizeof(smem_x));
}
auto token = bar.arrive(1);

bar.wait(cuda::std::move(token));

// smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
smem_x[threadIdx.x] += 1;
}
```

[See it on Godbolt](https://godbolt.org/z/9Yj89P76z){: .btn }


[`cuda::thread_scope`]: ./memory_model.md
[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object
[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12

Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,33 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(

return async_contract_fulfillment::async;
}

_LIBCUDACXX_DEVICE inline
void barrier_expect_tx(
barrier<thread_scope_block> & __b,
_CUDA_VSTD::ptrdiff_t __transaction_count_update) {

_LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
_LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

// We do not check for the statespace of the barrier here. This is
// on purpose. This allows debugging tools like memcheck/racecheck
// to detect that we are passing a pointer with the wrong state
// space to mbarrier.arrive. If we checked for the state space here,
// and __trap() if wrong, then those tools would not be able to help
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.

auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
asm (
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
}
#endif // __CUDA_MINIMUM_ARCH__

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
Expand Down
Loading