Fix multiple definition of InclusiveScanKernel (NVIDIA#2169)

Both `catch2_test_block_scan_api.cu` and `catch2_test_warp_scan_api.cu` defined a kernel named `InclusiveScanKernel(int*)`; because the two tests are linked into the same executable, the link step failed:

```
Linking CXX executable bin/cub.cpp14.catch2_test.lid_0
FAILED: bin/cub.cpp14.catch2_test.lid_0
...
/usr/bin/ld: cub/test/CMakeFiles/cub.cpp14.test.warp_scan_api.dir/catch2_test_warp_scan_api.cu.o: in function `InclusiveScanKernel(int*)':
/usr/local/cuda-12.7/targets/x86_64-linux/include/nvtx3/nvtxDetail/nvtxInitDefs.h:473: multiple definition of `InclusiveScanKernel(int*)'; cub/test/CMakeFiles/cub.cpp14.test.block_scan_api.dir/catch2_test_block_scan_api.cu.o:/usr/local/cuda-12.7/targets/x86_64-linux/include/nvtx3/nvtxDetail/nvtxInitDefs.h:468: first defined here
collect2: error: ld returned 1 exit status
```
pciolkosz · Aug 4, 2024 · 68bcf25 · 68bcf25
1 parent d3464a5
commit 68bcf25
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 8 deletions.
diff --git a/cub/test/catch2_test_block_scan_api.cu b/cub/test/catch2_test_block_scan_api.cu
@@ -39,7 +39,7 @@ constexpr int num_items_per_thread = 2;
 constexpr int block_num_threads    = 64;
 
 // example-begin inclusive-scan-array-init-value
-__global__ void InclusiveScanKernel(int* output)
+__global__ void InclusiveBlockScanKernel(int* output)
 {
   // Specialize BlockScan for a 1D block of 64 threads of type int
   using block_scan_t   = cub::BlockScan<int, 64>;
@@ -69,7 +69,7 @@ CUB_TEST("Block array-based inclusive scan works with initial value", "[scan][bl
 {
   thrust::device_vector<int> d_out(block_num_threads * num_items_per_thread);
 
-  InclusiveScanKernel<<<1, block_num_threads>>>(thrust::raw_pointer_cast(d_out.data()));
+  InclusiveBlockScanKernel<<<1, block_num_threads>>>(thrust::raw_pointer_cast(d_out.data()));
   REQUIRE(cudaSuccess == cudaPeekAtLastError());
   REQUIRE(cudaSuccess == cudaDeviceSynchronize());
 
@@ -89,7 +89,7 @@ CUB_TEST("Block array-based inclusive scan works with initial value", "[scan][bl
 }
 
 // example-begin inclusive-scan-array-aggregate-init-value
-__global__ void InclusiveScanKernelAggregate(int* output, int* d_block_aggregate)
+__global__ void InclusiveBlockScanKernelAggregate(int* output, int* d_block_aggregate)
 {
   // Specialize BlockScan for a 1D block of 64 threads of type int
   using block_scan_t   = cub::BlockScan<int, 64>;
@@ -124,7 +124,7 @@ CUB_TEST("Block array-based inclusive scan with block aggregate works with initi
   thrust::device_vector<int> d_out(block_num_threads * num_items_per_thread);
 
   c2h::device_vector<int> d_block_aggregate(1);
-  InclusiveScanKernelAggregate<<<1, block_num_threads>>>(
+  InclusiveBlockScanKernelAggregate<<<1, block_num_threads>>>(
     thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_block_aggregate.data()));
   REQUIRE(cudaSuccess == cudaPeekAtLastError());
   REQUIRE(cudaSuccess == cudaDeviceSynchronize());

diff --git a/cub/test/catch2_test_warp_scan_api.cu b/cub/test/catch2_test_warp_scan_api.cu
@@ -56,7 +56,7 @@ struct sum_op
 };
 
 // example-begin inclusive-warp-scan-init-value
-__global__ void InclusiveScanKernel(int* output)
+__global__ void InclusiveWarpScanKernel(int* output)
 {
   // Specialize WarpScan for type int
   using warp_scan_t = cub::WarpScan<int>;
@@ -89,7 +89,7 @@ CUB_TEST("Warp array-based inclusive scan works with initial value", "[scan][war
 {
   thrust::device_vector<int> d_out(num_warps * 32);
 
-  InclusiveScanKernel<<<1, num_warps * 32>>>(thrust::raw_pointer_cast(d_out.data()));
+  InclusiveWarpScanKernel<<<1, num_warps * 32>>>(thrust::raw_pointer_cast(d_out.data()));
   REQUIRE(cudaSuccess == cudaPeekAtLastError());
   REQUIRE(cudaSuccess == cudaDeviceSynchronize());
 
@@ -109,7 +109,7 @@ CUB_TEST("Warp array-based inclusive scan works with initial value", "[scan][war
 }
 
 // example-begin inclusive-warp-scan-init-value-aggregate
-__global__ void InclusiveScanKernelAggr(int* output, int* d_warp_aggregate)
+__global__ void InclusiveWarpScanKernelAggr(int* output, int* d_warp_aggregate)
 {
   // Specialize WarpScan for type int
   using warp_scan_t = cub::WarpScan<int>;
@@ -144,7 +144,7 @@ CUB_TEST("Warp array-based inclusive scan aggregate works with initial value", "
   thrust::device_vector<int> d_out(num_warps * 32);
   c2h::device_vector<int> d_warp_aggregate(num_warps);
 
-  InclusiveScanKernelAggr<<<1, num_warps * 32>>>(
+  InclusiveWarpScanKernelAggr<<<1, num_warps * 32>>>(
     thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_warp_aggregate.data()));
   REQUIRE(cudaSuccess == cudaPeekAtLastError());
   REQUIRE(cudaSuccess == cudaDeviceSynchronize());