From f8a26b21f280b6f5b9cac5e7a6fed876f5ddfc75 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 13 May 2024 15:04:56 +0200 Subject: [PATCH] PTX: make cp_async_bulk*_multicast functions sm_90a (#1734) --- .../docs/ptx/instructions/cp.async.bulk.md | 6 ++- .../ptx/instructions/cp.async.bulk.tensor.md | 14 ++++--- .../cuda/__ptx/instructions/cp_async_bulk.h | 8 ++-- .../__ptx/instructions/cp_async_bulk_tensor.h | 40 +++++++++---------- ...x.cp.async.bulk.multicast.compile.pass.cpp | 2 +- ...ync.bulk.tensor.multicast.compile.pass.cpp | 10 ++--- 6 files changed, 44 insertions(+), 36 deletions(-) diff --git a/libcudacxx/docs/ptx/instructions/cp.async.bulk.md b/libcudacxx/docs/ptx/instructions/cp.async.bulk.md index ce4b0f72de..84f188191a 100644 --- a/libcudacxx/docs/ptx/instructions/cp.async.bulk.md +++ b/libcudacxx/docs/ptx/instructions/cp.async.bulk.md @@ -15,6 +15,10 @@ **NOTE.** Both `srcMem` and `dstMem` must be 16-byte aligned, and `size` must be a multiple of 16. +## Changelog + +- In earlier versions, `cp_async_bulk_multicast` was enabled for SM_90. This has been changed to SM_90a. + ## Unicast | C++ | PTX | @@ -79,7 +83,7 @@ __device__ static inline void cp_async_bulk( ### [(0)](#0-cp_async_bulk_multicast) `cp_async_bulk_multicast` {: .no_toc } ```cuda -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90 +// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template diff --git a/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md b/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md index 53c471485e..9b1838b23b 100644 --- a/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md +++ b/libcudacxx/docs/ptx/instructions/cp.async.bulk.tensor.md @@ -11,6 +11,10 @@ {:toc} +## Changelog + +- In earlier versions, `cp_async_bulk_tensor_multicast` was enabled for SM_90. This has been changed to SM_90a. + ## Unicast | C++ | PTX | @@ -194,7 +198,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(0)](#0-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -211,7 +215,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(1)](#1-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -228,7 +232,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(2)](#2-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -245,7 +249,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(3)](#3-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -262,7 +266,7 @@ __device__ static inline void cp_async_bulk_tensor( ### [(4)](#4-cp_async_bulk_tensor_multicast) `cp_async_bulk_tensor_multicast` {: .no_toc } ```cuda -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 3ce19982a7..7acce21023 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -145,7 +145,7 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90 +ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -159,7 +159,7 @@ __device__ static inline void cp_async_bulk( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk( space_cluster_t, @@ -173,7 +173,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " "%4; // 1. 
" : @@ -185,7 +185,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 6e2b680900..b66981e8bb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -450,7 +450,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -464,7 +464,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -478,7 +478,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2}], [%3], %4; // 2a." : @@ -490,13 +490,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -510,7 +510,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -524,7 +524,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3}], [%4], %5; // 2b." 
: @@ -537,13 +537,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -557,7 +557,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -571,7 +571,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3, %4}], [%5], %6; // 2c." : @@ -585,13 +585,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -605,7 +605,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -619,7 +619,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." : @@ -634,13 +634,13 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90 +tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } template @@ -654,7 +654,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -668,7 +668,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." : @@ -684,7 +684,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index 1099382319..85d6e64993 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -38,7 +38,7 @@ __global__ void test_cp_async_bulk_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], // size, [smem_bar], ctaMask; // 1. diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index 199e7650af..b7a597d50a 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -38,7 +38,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. @@ -54,7 +54,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. @@ -70,7 +70,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
@@ -86,7 +86,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. @@ -102,7 +102,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) #if __cccl_ptx_isa >= 800 NV_IF_TARGET( - NV_PROVIDES_SM_90, + NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
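
A minimal, hypothetical usage sketch (not part of this patch) of the affected `ctaMask` overload after this change: the call site must now be guarded with `NV_HAS_FEATURE_SM_90a` instead of `NV_PROVIDES_SM_90`, and the translation unit must be compiled for `sm_90a`. The kernel and variable names below are invented for illustration; mbarrier initialization, cluster launch, and completion waiting are omitted. The `cp_async_bulk_tensor` multicast overloads follow the same pattern.

```cuda
// Hypothetical sketch only -- assumes compilation with `nvcc -arch=sm_90a`.
#include <cuda/ptx>
#include <cuda/std/cstdint>

#include <nv/target>

__global__ void demo_multicast(const void* gmem_src, cuda::std::uint16_t cta_mask)
{
  // Per the note in cp.async.bulk.md, srcMem and dstMem must be 16-byte aligned
  // and size must be a multiple of 16.
  alignas(16) __shared__ char smem_dst[128];
  __shared__ cuda::std::uint64_t bar; // mbarrier (init/arrive/wait elided for brevity)

  NV_IF_TARGET(
    NV_HAS_FEATURE_SM_90a,
    (cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, // .dst = .shared::cluster
                              cuda::ptx::space_global,  // .src = .global
                              smem_dst,
                              gmem_src,
                              128, // size in bytes
                              &bar,
                              cta_mask);));
}
```

Built for any other target (including plain `sm_90`), the guarded block is compiled out; an unguarded call would instead fail at link time against the `__cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__` symbol introduced by this patch.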