From f8a26b21f280b6f5b9cac5e7a6fed876f5ddfc75 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 13 May 2024 15:04:56 +0200 Subject: [PATCH] PTX: make cp_async_bulk*_multicast functions sm_90a (#1734) --- .../docs/ptx/instructions/cp.async.bulk.md | 6 ++- .../ptx/instructions/cp.async.bulk.tensor.md | 14 ++++--- .../cuda/__ptx/instructions/cp_async_bulk.h | 8 ++-- .../__ptx/instructions/cp_async_bulk_tensor.h | 40 +++++++++---------- ...x.cp.async.bulk.multicast.compile.pass.cpp | 2 +- ...ync.bulk.tensor.multicast.compile.pass.cpp | 10 ++--- 6 files changed, 44 insertions(+), 36 deletions(-) diff --git a/libcudacxx/docs/ptx/instructions/cp.async.bulk.md b/libcudacxx/docs/ptx/instructions/cp.async.bulk.md index ce4b0f72de..84f188191a 100644 --- a/libcudacxx/docs/ptx/instructions/cp.async.bulk.md +++ b/libcudacxx/docs/ptx/instructions/cp.async.bulk.md @@ -15,6 +15,10 @@ **NOTE.** Both `srcMem` and `dstMem` must be 16-byte aligned, and `size` must be a multiple of 16. +## Changelog + +- In earlier versions, `cp_async_bulk_multicast` was enabled for SM_90. This has been changed to SM_90a. + ## Unicast | C++ | PTX | @@ -79,7 +83,7 @@ __device__ static inline void cp_async_bulk( ### [(0)](#0-cp_async_bulk_multicast) `cp_async_bulk_multicast` {: .no_toc } ```cuda -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90 +// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template diff --git a/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md b/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md index 53c471485e..9b1838b23b 100644 --- a/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md +++ b/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md @@ -11,6 +11,10 @@ {:toc} +## Changelog + +- In earlier versions, `cp_async_bulk_tensor_multicast` was enabled for SM_90. This has been changed to SM_90a. + ## Unicast | C++ | PTX | @@ -194,7 +198,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(0)](#0-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -211,7 +215,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(1)](#1-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -228,7 +232,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(2)](#2-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -245,7 +249,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(3)](#3-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -262,7 +266,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(4)](#4-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 3ce19982a7..7acce21023 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -145,7 +145,7 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90 +ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -159,7 +159,7 @@ __device__ static inline void cp_async_bulk( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk( space_cluster_t, @@ -173,7 +173,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " "%4; // 1. 
" : @@ -185,7 +185,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 6e2b680900..b66981e8bb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -450,7 +450,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -464,7 +464,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -478,7 +478,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2}], [%3], %4; // 2a." : @@ -490,13 +490,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -510,7 +510,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -524,7 +524,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3}], [%4], %5; // 2b." 
: @@ -537,13 +537,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -557,7 +557,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -571,7 +571,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3, %4}], [%5], %6; // 2c." : @@ -585,13 +585,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -605,7 +605,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -619,7 +619,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." : @@ -634,13 +634,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -654,7 +654,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -668,7 +668,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." : @@ -684,7 +684,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index 1099382319..85d6e64993 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -38,7 +38,7 @@ __global__ void test_cp_async_bulk_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], // size, [smem_bar], ctaMask; // 1. diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index 199e7650af..b7a597d50a 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -38,7 +38,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. @@ -54,7 +54,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. @@ -70,7 +70,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
@@ -86,7 +86,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. @@ -102,7 +102,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
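
A minimal, hypothetical usage sketch (not part of this patch) of the affected `ctaMask` overload after this change: the call site must now be guarded with `NV_HAS_FEATURE_SM_90a` instead of `NV_PROVIDES_SM_90`, and the translation unit must be compiled for `sm_90a`. The kernel and variable names below are invented for illustration; mbarrier initialization, cluster launch, and completion waiting are omitted. The `cp_async_bulk_tensor` multicast overloads follow the same pattern.

```cuda
// Hypothetical sketch only -- assumes compilation with `nvcc -arch=sm_90a`.
#include <cuda/ptx>
#include <cuda/std/cstdint>

#include <nv/target>

__global__ void demo_multicast(const void* gmem_src, cuda::std::uint16_t cta_mask)
{
  // Per the note in cp.async.bulk.md, srcMem and dstMem must be 16-byte aligned
  // and size must be a multiple of 16.
  alignas(16) __shared__ char smem_dst[128];
  __shared__ cuda::std::uint64_t bar; // mbarrier (init/arrive/wait elided for brevity)

  NV_IF_TARGET(
    NV_HAS_FEATURE_SM_90a,
    (cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, // .dst = .shared::cluster
                              cuda::ptx::space_global,  // .src = .global
                              smem_dst,
                              gmem_src,
                              128, // size in bytes
                              &bar,
                              cta_mask);));
}
```

Built for any other target (including plain `sm_90`), the guarded block is compiled out; an unguarded call would instead fail at link time against the `__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__` symbol introduced by this patch.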