NVIDIA · miscco · Oct 17, 2023 · Sep 27, 2023 · Oct 16, 2023 · Oct 16, 2023
@@ -46,36 +46,45 @@ void mbarrier_complete_tx(
   );
 }
 
-template<typename Barrier>
+template<bool split_arrive_and_expect>
 __device__
-void thread(Barrier& b, int arrives_per_thread)
+void thread(cuda::barrier<cuda::thread_scope_block>& b, int arrives_per_thread)
 {
   constexpr int tx_count = 1;
-  auto tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count);
+  typename cuda::barrier<cuda::thread_scope_block>::arrival_token tok;
+
+  if _LIBCUDACXX_CONSTEXPR_AFTER_CXX17 (split_arrive_and_expect) {
+    cuda::device::barrier_expect_tx(b, tx_count);
+    tok = b.arrive(arrives_per_thread);
+  } else{
+    tok = cuda::device::barrier_arrive_tx(b, arrives_per_thread, tx_count);
+  }
+
   // Manually increase the transaction count of the barrier.
   mbarrier_complete_tx(b, tx_count);
 
   b.wait(cuda::std::move(tok));
 }
 
+template<bool split_arrive_and_expect>
 __device__
 void test()
 {
   NV_DISPATCH_TARGET(
     NV_IS_DEVICE, (
       // Run all threads, each arriving with arrival count 1
-      constexpr auto block = cuda::thread_scope_block;
+      using barrier_t = cuda::barrier<cuda::thread_scope_block>;
 
-      __shared__ cuda::barrier<block> bar_1;
-      init(&bar_1, (int) blockDim.x);
+      shared_memory_selector<barrier_t, constructor_initializer> sel_1;
+      barrier_t* bar_1 = sel_1.construct(blockDim.x);
       __syncthreads();
-      thread(bar_1, 1);
+      thread<split_arrive_and_expect>(*bar_1, 1);
 
       // Run all threads, each arriving with arrival count 2
-      __shared__ cuda::barrier<block> bar_2;
-      init(&bar_2, (int) 2 * blockDim.x);
+      shared_memory_selector<barrier_t, constructor_initializer> sel_2;
+      barrier_t* bar_2 = sel_2.construct(2 * blockDim.x);
       __syncthreads();
-      thread(bar_2, 2);
+      thread<split_arrive_and_expect>(*bar_2, 2);
     )
   );
 }

@@ -25,7 +25,8 @@ int main(int, char**)
         cuda_thread_count = 256;
         ),
         NV_IS_DEVICE, (
-            test();
+            constexpr bool split_arrive_and_expect = false;
+            test<split_arrive_and_expect>();
         )
     );
 

@@ -25,7 +25,8 @@ int main(int, char**)
         cuda_thread_count = 2;
         ),
         NV_IS_DEVICE, (
-            test();
+            constexpr bool split_arrive_and_expect = false;
+            test<split_arrive_and_expect>();
         )
     );
 

@@ -25,7 +25,8 @@ int main(int, char**)
         cuda_thread_count = 32;
         ),
         NV_IS_DEVICE, (
-            test();
+            constexpr bool split_arrive_and_expect = false;
+            test<split_arrive_and_expect>();
         )
     );
 

@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90
+
+// <cuda/barrier>
+
+#include "arrive_tx.h"
+
+int main(int, char**)
+{
+    NV_DISPATCH_TARGET(
+        NV_IS_HOST, (
+        // Required by concurrent_agents_launch to know how many we're
+        // launching. This can only be an int, because the nvrtc tests use grep
+        // to figure out how many threads to launch.
+        cuda_thread_count = 256;
+        ),
+        NV_IS_DEVICE, (
+            constexpr bool split_arrive_and_expect = true;
+            test<split_arrive_and_expect>();
+        )
+    );
+
+    return 0;
+}
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90
+
+// <cuda/barrier>
+
+#include <cuda/barrier>
+#include "test_macros.h"
+
+// Suppress warning about barrier in shared memory
+TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)
+
+__device__ uint64_t bar_storage;
+
+int main(int, char**){
+    NV_IF_TARGET(
+        NV_IS_DEVICE, (
+            cuda::barrier<cuda::thread_scope_block> *bar_ptr;
+            bar_ptr = reinterpret_cast<cuda::barrier<cuda::thread_scope_block> *>(bar_storage);
+
+            if (threadIdx.x == 0) {
+                init(bar_ptr, blockDim.x);
+            }
+            __syncthreads();
+
+            // Should fail because the barrier is in device memory.
+            cuda::device::barrier_expect_tx(*bar_ptr, 1);
+    ));
+    return 0;
+}
@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90
+
+// <cuda/barrier>
+
+#include "arrive_tx.h"
+
+int main(int, char**)
+{
+    NV_DISPATCH_TARGET(
+        NV_IS_HOST, (
+        // Required by concurrent_agents_launch to know how many we're
+        // launching. This can only be an int, because the nvrtc tests use grep
+        // to figure out how many threads to launch.
+        cuda_thread_count = 2;
+        ),
+        NV_IS_DEVICE, (
+            constexpr bool split_arrive_and_expect = true;
+            test<split_arrive_and_expect>();
+        )
+    );
+
+    return 0;
+}
@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: pre-sm-90
+
+// <cuda/barrier>
+
+#include "arrive_tx.h"
+
+int main(int, char**)
+{
+    NV_DISPATCH_TARGET(
+        NV_IS_HOST, (
+        // Required by concurrent_agents_launch to know how many we're
+        // launching. This can only be an int, because the nvrtc tests use grep
+        // to figure out how many threads to launch.
+        cuda_thread_count = 32;
+        ),
+        NV_IS_DEVICE, (
+            constexpr bool split_arrive_and_expect = true;
+            test<split_arrive_and_expect>();
+        )
+    );
+
+    return 0;
+}
@@ -20,10 +20,10 @@ If `!(scope == thread_block_scope && __isShared(this))`, then the semantics are
 the same as [`cuda::std::barrier`]; otherwise, see below. 
 The `cuda::barrier` class templates extends `cuda::std::barrier` with the following additional operations:
 
-| [`cuda::barrier::init`]                 | Initialize a `cuda::barrier`. `(friend function)`                       |
-| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)`                |
+| [`cuda::barrier::init`]                 | Initialize a `cuda::barrier`. `(friend function)`                                                 |
+| [`cuda::device::barrier_native_handle`] | Get the native handle to a `cuda::barrier`. `(function)`                                          |
 | [`cuda::device::barrier_arrive_tx`]     | Arrive on a `cuda::barrier<cuda::thread_scope_block>` with transaction count update. `(function)` |
-
+| [`cuda::device::barrier_expect_tx`]     | Update transaction count of `cuda::barrier<cuda::thread_scope_block>`. `(function)`               |
 
 If `scope == thread_scope_block && __isShared(this)`, then the
 semantics of [[thread.barrier.class]](http://eel.is/c++draft/thread.barrier.class) of ISO/IEC

@@ -0,0 +1,75 @@
+---
+grand_parent: Extended API
+parent: Barriers
+---
+
+# `cuda::device::barrier_expect_tx`
+
+Defined in header `<cuda/barrier>`:
+
+```cuda
+__device__
+void cuda::device::barrier_expect_tx(
+  cuda::barrier<cuda::thread_scope_block>& bar,
+  ptrdiff_t transaction_count_update);
+```
+
+Updates the expected transaction count of a barrier in shared memory.
+
+## Preconditions
+
+* `__isShared(&bar) == true`
+* `0 <= transaction_count_update && transaction_count_update <= (1 << 20) - 1`
+
+## Effects
+
+* This function increments the expected transaction count by `transaction_count_update`.
+* This function executes atomically.
+
+## Notes
+
+This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
+higher.
+
+## Example
+
+```cuda
+#include <cuda/barrier>
+#include <cuda/std/utility> // cuda::std::move
+
+#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
+static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy_expect_tx is not available.");
+#endif // __CUDA_MINIMUM_ARCH__
+
+__device__ alignas(16) int gmem_x[2048];
+
+__global__ void example_kernel() {
+    using barrier_t = cuda::barrier<cuda::thread_scope_block>;
+  __shared__ alignas(16) int smem_x[1024];
+  __shared__ barrier_t bar;
+
+  if (threadIdx.x == 0) {
+    init(&bar, blockDim.x);
+  }
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
+    cuda::device::barrier_expect_tx(bar, sizeof(smem_x));
+  } 
+  auto token = bar.arrive(1);
+
+  bar.wait(cuda::std::move(token));
+
+  // smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
+  smem_x[threadIdx.x] += 1;
+}
+```
+
+[See it on Godbolt](https://godbolt.org/z/9Yj89P76z){: .btn }
+
+
+[`cuda::thread_scope`]: ./memory_model.md
+[Tracking asynchronous operations by the mbarrier object]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tracking-asynchronous-operations-by-the-mbarrier-object
+[thread.barrier.class paragraph 12]: https://eel.is/c++draft/thread.barrier.class#12
+
@@ -652,6 +652,33 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
 
     return async_contract_fulfillment::async;
 }
+
+_LIBCUDACXX_DEVICE inline
+void barrier_expect_tx(
+    barrier<thread_scope_block> & __b,
+    _CUDA_VSTD::ptrdiff_t __transaction_count_update) {
+
+    _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+    _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
+    _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
+
+    // We do not check for the statespace of the barrier here. This is
+    // on purpose. This allows debugging tools like memcheck/racecheck
+    // to detect that we are passing a pointer with the wrong state
+    // space to mbarrier.arrive. If we checked for the state space here,
+    // and __trap() if wrong, then those tools would not be able to help
+    // us in release builds. In debug builds, the error would be caught
+    // by the asserts at the top of this function.
+
+    auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+    asm (
+        "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
+        :
+        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
+          "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
+        : "memory");
+}
 #endif // __CUDA_MINIMUM_ARCH__
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE