Add PTX wrappers
ahendriksen committed Aug 28, 2023
1 parent 567562d commit 368ceda
Showing 2 changed files with 178 additions and 0 deletions.
2 changes: 2 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__config
@@ -1482,6 +1482,8 @@ typedef __char32_t char32_t;
#define _LIBCUDACXX_END_NAMESPACE_CUDA } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_EXPERIMENTAL namespace cuda { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_EXPERIMENTAL } } }
#endif

// Inline namespaces are available in Clang/GCC/MSVC regardless of C++ dialect.
176 changes: 176 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
@@ -857,4 +857,180 @@ async_contract_fulfillment memcpy_async(void * __destination, void const * __sou

_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_EXPERIMENTAL

// - cp_async_bulk_global_to_shared
// - cp_async_bulk_shared_to_global
// - cp_async_bulk_tensor_{1,2,3,4,5}d_global_to_shared
// - cp_async_bulk_tensor_2d_shared_to_global
// - fence_proxy_async_shared_cta
// - cp_async_bulk_commit_group
// - cp_async_bulk_wait_group_read<0, …, 8>

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
_LIBCUDACXX_DEVICE
void cp_async_bulk_global_to_shared(void *dest, const void *src, uint32_t size, cuda::barrier<cuda::thread_scope_block> &bar)
{
#if (_LIBCUDACXX_DEBUG_LEVEL >= 2)
_LIBCUDACXX_DEBUG_ASSERT(size % 16 == 0);
_LIBCUDACXX_DEBUG_ASSERT(__isShared(dest));
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(src));
#endif

asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(dest))),
"l"(static_cast<uint64_t>(__cvta_generic_to_global(src))),
"r"(size),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(cuda::device::barrier_native_handle(bar))))
: "memory");
}
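
// Illustrative usage, not part of this commit: the copy above completes the
// barrier through its transaction count, so the transferred bytes must be
// accounted for on the barrier before waiting. Assuming the
// cuda::device::barrier_arrive_tx API, a call site could look like:
//
//   barrier::arrival_token token;
//   if (threadIdx.x == 0) {
//     cp_async_bulk_global_to_shared(smem, gmem, sizeof(smem), bar);
//     token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
//   } else {
//     token = bar.arrive(1);
//   }
//   bar.wait(cuda::std::move(token));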


// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
_LIBCUDACXX_DEVICE
void cp_async_bulk_shared_to_global(void *dest, const void *src, uint32_t size)
{
asm volatile(
"cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n"
:
: "l"(static_cast<uint64_t>(__cvta_generic_to_global(dest))),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(src))),
"r"(size)
: "memory");
}
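
// Illustrative pairing, not part of this function: shared-to-global bulk
// copies are tracked by the bulk async-group, so a writer typically commits
// the group and waits for its reads before reusing the shared buffer:
//
//   cp_async_bulk_shared_to_global(gmem, smem, sizeof(smem));
//   cp_async_bulk_commit_group();
//   cp_async_bulk_wait_group_read<0>();  // smem is now safe to reuse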

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
_LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_1d_global_to_shared(
void *dest, const void *tensor_map, int c0, cuda::barrier<cuda::thread_scope_block> &bar)
{
asm volatile(
"cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes "
"[%0], [%1, {%2}], [%3];\n"
:
: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(dest))),
"l"(tensor_map),
"r"(c0),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(cuda::device::barrier_native_handle(bar))))
: "memory");
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
_LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_2d_global_to_shared(
void *dest, const void *tensor_map, int c0, int c1, cuda::barrier<cuda::thread_scope_block> &bar)
{
asm volatile(
"cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes "
"[%0], [%1, {%2, %3}], [%4];\n"
:
: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(dest))),
"l"(tensor_map),
"r"(c0),
"r"(c1),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(cuda::device::barrier_native_handle(bar))))
: "memory");
}
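
// Illustrative call site, not part of this commit: `tensor_map` is expected
// to point to a CUtensorMap, created on the host (for example with
// cuTensorMapEncodeTiled) and passed to the kernel, for instance as a
// __grid_constant__ parameter:
//
//   __global__ void kernel(const __grid_constant__ CUtensorMap map, ...) {
//     ...
//     cp_async_bulk_tensor_2d_global_to_shared(smem, &map, x0, y0, bar);
//   }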

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
_LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_3d_global_to_shared(
void *dest, const void *tensor_map, int c0, int c1, int c2, cuda::barrier<cuda::thread_scope_block> &bar)
{
asm volatile(
"cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes "
"[%0], [%1, {%2, %3, %4}], [%5];\n"
:
: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(dest))),
"l"(tensor_map),
"r"(c0),
"r"(c1),
"r"(c2),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(cuda::device::barrier_native_handle(bar))))
: "memory");
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
_LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_4d_global_to_shared(
void *dest, const void *tensor_map, int c0, int c1, int c2, int c3, cuda::barrier<cuda::thread_scope_block> &bar)
{
asm volatile(
"cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes "
"[%0], [%1, {%2, %3, %4, %5}], [%6];\n"
:
: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(dest))),
"l"(tensor_map),
"r"(c0),
"r"(c1),
"r"(c2),
"r"(c3),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(cuda::device::barrier_native_handle(bar))))
: "memory");
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
_LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_5d_global_to_shared(
void *dest, const void *tensor_map, int c0, int c1, int c2, int c3, int c4, cuda::barrier<cuda::thread_scope_block> &bar)
{
asm volatile(
"cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes "
"[%0], [%1, {%2, %3, %4, %5, %6}], [%7];\n"
:
: "r"(static_cast<uint32_t>(__cvta_generic_to_shared(dest))),
"l"(tensor_map),
"r"(c0),
"r"(c1),
"r"(c2),
"r"(c3),
"r"(c4),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(cuda::device::barrier_native_handle(bar))))
: "memory");
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
_LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_2d_shared_to_global(
const void *tensor_map, int x, int y, const void *src)
{
asm volatile(
"cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group "
"[%0, {%1, %2}], [%3];\n"
:
: "l"(tensor_map),
"r"(x),
"r"(y),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(src)))
: "memory");
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
_LIBCUDACXX_DEVICE
void fence_proxy_async_shared_cta() {
asm volatile("fence.proxy.async.shared::cta; \n":::"memory");
}
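
// Illustrative ordering sketch, not part of this commit: writes to shared
// memory through the generic proxy must be fenced before the async proxy
// (e.g. a bulk tensor copy) reads them:
//
//   smem[threadIdx.x] = value;        // generic-proxy write
//   fence_proxy_async_shared_cta();   // make it visible to the async proxy
//   __syncthreads();
//   if (threadIdx.x == 0) {
//     cp_async_bulk_tensor_2d_shared_to_global(&map, x0, y0, smem);
//   }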

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
_LIBCUDACXX_DEVICE
void cp_async_bulk_commit_group()
{
asm volatile("cp.async.bulk.commit_group;\n" ::: "memory");
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
template <int n_prior>
_LIBCUDACXX_DEVICE
void cp_async_bulk_wait_group_read()
{
static_assert(n_prior <= 8, "cp_async_bulk_wait_group_read: waiting for more than 8 groups is not supported.");
asm volatile("cp.async.bulk.wait_group.read %0; \n"
:
: "n"(n_prior));
}
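
// Illustrative, not part of this commit: the template parameter is the
// number of most-recently committed bulk async-groups that may remain
// outstanding, so <0> waits for the reads of all committed groups:
//
//   cp_async_bulk_commit_group();
//   ...                                  // unrelated work can overlap here
//   cp_async_bulk_wait_group_read<0>();  // all prior groups' reads complete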

_LIBCUDACXX_END_NAMESPACE_CUDA_EXPERIMENTAL

#endif // _LIBCUDACXX___CUDA_BARRIER_H
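
Taken together, a hypothetical end-to-end kernel built on these wrappers could look like the sketch below. This is an illustration rather than part of the commit: it assumes an sm_90 target, a block of 1024 threads, and the cuda::device::barrier_arrive_tx API.

#include <cuda/barrier>

using barrier_t = cuda::barrier<cuda::thread_scope_block>;

__global__ void bulk_copy_roundtrip(int *gmem_out, const int *gmem_in)
{
  // Bulk copies require 16-byte alignment of the shared-memory buffer.
  __shared__ alignas(16) int smem[1024];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0) { init(&bar, blockDim.x); }
  __syncthreads();

  // One thread launches the copy; thread 0 accounts for the bytes, the
  // others arrive normally, and everyone waits for completion.
  barrier_t::arrival_token token;
  if (threadIdx.x == 0) {
    cuda::experimental::cp_async_bulk_global_to_shared(
        smem, gmem_in, sizeof(smem), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
  } else {
    token = bar.arrive(1);
  }
  bar.wait(cuda::std::move(token));

  smem[threadIdx.x] += 1;  // generic-proxy writes to the shared buffer

  // Make the generic-proxy writes visible to the async proxy before the
  // bulk copy back to global memory reads them.
  cuda::experimental::fence_proxy_async_shared_cta();
  __syncthreads();

  if (threadIdx.x == 0) {
    cuda::experimental::cp_async_bulk_shared_to_global(
        gmem_out, smem, sizeof(smem));
    cuda::experimental::cp_async_bulk_commit_group();
    // Wait until the group's reads of smem finish before reusing it.
    cuda::experimental::cp_async_bulk_wait_group_read<0>();
  }
}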
