From 51c1b22cfa7fb2265706a77010a7863d237129d7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 19 Aug 2024 13:19:03 +0200 Subject: [PATCH 01/71] Workaround GCC 13 issue with empty histogram decoder op (#2252) * Workaround GCC 13 issue * Update cub/cub/device/dispatch/dispatch_histogram.cuh Co-authored-by: Michael Schellenberger Costa --------- Co-authored-by: Michael Schellenberger Costa --- cub/cub/device/dispatch/dispatch_histogram.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index aa8cc2f5c0..1019569240 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -847,7 +847,7 @@ public: { // GCC 14 rightfully warns that when a value-initialized array of this struct is copied using memcpy, uninitialized // bytes may be accessed. To avoid this, we add a dummy member, so value initialization actually initializes the memory. -#if defined(_CCCL_COMPILER_GCC) && __GNUC__ == 14 +#if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 13 char dummy; #endif From da9b7ddf803da036d342254a5336ae9c6b70d889 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 20 Aug 2024 10:20:10 +0200 Subject: [PATCH 02/71] Refactor Thrust's logical meta functions (#2260) --- docs/thrust/api_docs/utility/type_traits.rst | 3 - .../include/cuda/std/__type_traits/negation.h | 4 +- .../type_traits/logical_metafunctions.h | 297 ++++-------------- 3 files changed, 55 insertions(+), 249 deletions(-) diff --git a/docs/thrust/api_docs/utility/type_traits.rst b/docs/thrust/api_docs/utility/type_traits.rst index 3351d27151..1ce609ffb3 100644 --- a/docs/thrust/api_docs/utility/type_traits.rst +++ b/docs/thrust/api_docs/utility/type_traits.rst @@ -8,7 +8,4 @@ Type Traits :maxdepth: 1 ${repo_docs_api_path}/*struct*proclaim__contiguous__iterator* - ${repo_docs_api_path}/*struct*conjunction* - ${repo_docs_api_path}/*struct*disjunction* - ${repo_docs_api_path}/*struct*negation* ${repo_docs_api_path}/*typedef_group__type__traits* diff --git a/libcudacxx/include/cuda/std/__type_traits/negation.h b/libcudacxx/include/cuda/std/__type_traits/negation.h index 547b38be42..cd01236f9c 100644 --- a/libcudacxx/include/cuda/std/__type_traits/negation.h +++ b/libcudacxx/include/cuda/std/__type_traits/negation.h @@ -28,13 +28,13 @@ template struct _Not : bool_constant {}; -#if _CCCL_STD_VER > 2011 template struct negation : _Not<_Tp> {}; +#if _CCCL_STD_VER >= 2014 template _LIBCUDACXX_INLINE_VAR constexpr bool negation_v = !_Tp::value; -#endif // _CCCL_STD_VER > 2014 +#endif // _CCCL_STD_VER >= 2014 _LIBCUDACXX_END_NAMESPACE_STD diff --git a/thrust/thrust/type_traits/logical_metafunctions.h b/thrust/thrust/type_traits/logical_metafunctions.h index 9f034731a8..1be7bbd06a 100644 --- a/thrust/thrust/type_traits/logical_metafunctions.h +++ b/thrust/thrust/type_traits/logical_metafunctions.h @@ -14,14 +14,6 @@ * limitations under the License. */ -/*! \file - * \brief C++17's - * std::conjunction, - * std::disjunction, - * and std::negation - * metafunctions and related extensions. - */ - #pragma once #include @@ -34,267 +26,84 @@ # pragma system_header #endif // no system header -#include +#include +#include +#include THRUST_NAMESPACE_BEGIN -/*! \addtogroup utility - * \{ - */ - -/*! \addtogroup type_traits Type Traits - * \{ - */ - -/*! \brief std::integral_constant - * whose value is (... && Ts::value). - * - * \see conjunction_v - * \see conjunction_value - * \see std::conjunction - */ -#if _CCCL_STD_VER >= 2017 -template -using conjunction = std::conjunction; -#else // Older than C++17. -template -struct conjunction; - -/*! \cond - */ - -template <> -struct conjunction<> : std::true_type -{}; - -template -struct conjunction : T -{}; - -template -struct conjunction : std::conditional::type -{}; - -template -struct conjunction : std::conditional, T0>::type -{}; - -/*! \endcond - */ -#endif - -/*! \brief constexpr bool whose value is (... && Ts::value). - * - * \see conjunction - * \see conjunction_value - * \see std::conjunction - */ -#if _CCCL_STD_VER >= 2014 -template -constexpr bool conjunction_v = conjunction::value; -#endif - -/*! \brief std::integral_constant - * whose value is (... || Ts::value). - * - * \see disjunction_v - * \see disjunction_value - * \see std::disjunction - */ -#if _CCCL_STD_VER >= 2017 -template -using disjunction = std::disjunction; -#else // Older than C++17. -template -struct disjunction; - -/*! \cond - */ - -template <> -struct disjunction<> : std::false_type -{}; - -template -struct disjunction : T -{}; - -template -struct disjunction : std::conditional>::type -{}; - -/*! \endcond - */ -#endif - -/*! \brief constexpr bool whose value is (... || Ts::value). - * - * \see disjunction - * \see disjunction_value - * \see std::disjunction - */ -#if _CCCL_STD_VER >= 2014 -template -constexpr bool disjunction_v = disjunction::value; -#endif - -/*! \brief std::integral_constant - * whose value is !Ts::value. - * - * \see negation_v - * \see negation_value - * \see std::negation - */ -#if _CCCL_STD_VER >= 2017 -template -using negation = std::negation; -#else // Older than C++17. -template -struct negation; - -/*! \cond - */ - -template -struct negation : std::integral_constant -{}; - -/*! \endcond - */ -#endif - -/*! \brief constexpr bool whose value is !Ts::value. - * - * \see negation - * \see negation_value - * \see std::negation - */ -#if _CCCL_STD_VER >= 2014 -template -constexpr bool negation_v = negation::value; +//! \addtogroup utility +//! \{ +//! \addtogroup type_traits Type Traits +//! \{ + +using ::cuda::std::conjunction; +using ::cuda::std::disjunction; +using ::cuda::std::negation; +#if _CCCL_STD_VER >= 2014 && !defined(_LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES) +using ::cuda::std::conjunction_v; +using ::cuda::std::disjunction_v; +using ::cuda::std::negation_v; #endif -/////////////////////////////////////////////////////////////////////////////// - -/*! \brief std::integral_constant - * whose value is (... && Bs). - * - * \see conjunction_value_v - * \see conjunction - * \see std::conjunction - */ +//! \brief std::integral_constant +//! whose value is (... && Bs). +//! +//! \see conjunction_value_v +//! \see conjunction +//! \see std::conjunction template -struct conjunction_value; +using conjunction_value = conjunction<::cuda::std::bool_constant...>; #if _CCCL_STD_VER >= 2014 -/*! \brief constexpr bool whose value is (... && Bs). - * - * \see conjunction_value - * \see conjunction - * \see std::conjunction - */ +//! \brief constexpr bool whose value is (... && Bs). +//! +//! \see conjunction_value +//! \see conjunction +//! \see std::conjunction template constexpr bool conjunction_value_v = conjunction_value::value; #endif -/*! \cond - */ - -template <> -struct conjunction_value<> : std::true_type -{}; - -template -struct conjunction_value : std::integral_constant -{}; - -template -struct conjunction_value : std::integral_constant::value> -{}; - -/*! \endcond - */ - -/////////////////////////////////////////////////////////////////////////////// - -/*! \brief std::integral_constant - * whose value is (... || Bs). - * - * \see disjunction_value_v - * \see disjunction - * \see std::disjunction - */ +//! \brief std::integral_constant +//! whose value is (... || Bs). +//! +//! \see disjunction_value_v +//! \see disjunction +//! \see std::disjunction template -struct disjunction_value; +using disjunction_value = disjunction<::cuda::std::bool_constant...>; #if _CCCL_STD_VER >= 2014 -/*! \brief constexpr bool whose value is (... || Bs). - * - * \see disjunction_value - * \see disjunction - * \see std::disjunction - */ +//! \brief constexpr bool whose value is (... || Bs). +//! +//! \see disjunction_value +//! \see disjunction +//! \see std::disjunction template constexpr bool disjunction_value_v = disjunction_value::value; #endif -/*! \cond - */ - -template <> -struct disjunction_value<> : std::false_type -{}; - -template -struct disjunction_value : std::integral_constant -{}; - -template -struct disjunction_value : std::integral_constant::value> -{}; - -/*! \endcond - */ - -/////////////////////////////////////////////////////////////////////////////// - -/*! \brief std::integral_constant - * whose value is !Bs. - * - * \see negation_value_v - * \see negation - * \see std::negation - */ +//! \brief std::integral_constant +//! whose value is !Bs. +//! +//! \see negation_value_v +//! \see negation +//! \see std::negation template -struct negation_value; +using negation_value = ::cuda::std::bool_constant; #if _CCCL_STD_VER >= 2014 -/*! \brief constexpr bool whose value is !Ts::value. - * - * \see negation_value - * \see negation - * \see std::negation - */ +//! \brief constexpr bool whose value is !Ts::value. +//! +//! \see negation_value +//! \see negation +//! \see std::negation template constexpr bool negation_value_v = negation_value::value; #endif -/*! \cond - */ - -template -struct negation_value : std::integral_constant -{}; - -/*! \endcond - */ - -/////////////////////////////////////////////////////////////////////////////// - -/*! \} // type traits - */ - -/*! \} // utility - */ +//! \} // type traits +//! \} // utility THRUST_NAMESPACE_END From f871aebcc4fdc0aa3cf6d0b7bb4e80f4778a7fc3 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 20 Aug 2024 10:26:04 +0200 Subject: [PATCH 03/71] Fix use of doxygen \file command (#2259) --- cub/cub/block/block_adjacent_difference.cuh | 5 +++-- cub/cub/block/block_load.cuh | 3 ++- cub/cub/block/block_radix_rank.cuh | 3 ++- cub/cub/block/block_reduce.cuh | 5 +++-- cub/cub/block/block_scan.cuh | 5 +++-- cub/cub/block/block_shuffle.cuh | 5 +++-- cub/cub/block/block_store.cuh | 3 ++- cub/cub/device/device_copy.cuh | 3 ++- cub/cub/device/device_histogram.cuh | 6 +++--- cub/cub/device/device_memcpy.cuh | 3 ++- cub/cub/device/device_partition.cuh | 5 +++-- cub/cub/device/device_radix_sort.cuh | 6 +++--- cub/cub/device/device_reduce.cuh | 6 +++--- cub/cub/device/device_run_length_encode.cuh | 6 +++--- cub/cub/device/device_scan.cuh | 5 +++-- cub/cub/device/device_segmented_radix_sort.cuh | 5 +++-- cub/cub/device/device_segmented_reduce.cuh | 6 +++--- cub/cub/device/device_segmented_sort.cuh | 6 +++--- cub/cub/device/device_select.cuh | 6 +++--- cub/cub/device/device_spmv.cuh | 5 +++-- cub/cub/util_cpp_dialect.cuh | 3 ++- cub/cub/warp/warp_load.cuh | 3 ++- cub/cub/warp/warp_store.cuh | 3 ++- cub/test/catch2_main.cuh | 8 ++++---- cub/test/catch2_runner.cpp | 5 +++-- cub/test/catch2_runner_helper.cu | 8 ++++---- cub/test/catch2_runner_helper.inl | 8 ++++---- cub/test/catch2_test_launch_helper.h | 3 ++- .../cuda/experimental/__container/uninitialized_buffer.h | 3 ++- libcudacxx/include/cuda/memory_resource | 3 ++- thrust/thrust/system/cpp/pointer.h | 5 ++--- 31 files changed, 84 insertions(+), 65 deletions(-) diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 709e9c1bd0..5bc3bae321 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockAdjacentDifference class provides collective methods for computing -//! the differences of adjacent elements partitioned across a CUDA thread block. +//! @file +//! The cub::BlockAdjacentDifference class provides collective methods for computing the differences of adjacent +//! elements partitioned across a CUDA thread block. #pragma once diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 76c073f1b5..284ac4401e 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -26,7 +26,8 @@ * ******************************************************************************/ -//! @file block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. +//! @file +//! block_load.cuh Operations for reading linear tiles of data into the CUDA thread block. #pragma once diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index c91731ae03..21a4879192 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -26,7 +26,8 @@ * ******************************************************************************/ -//! @file cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block +//! @file +//! cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block #pragma once diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index d35c90c06d..12c97ee5b8 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockReduce class provides :ref:`collective ` methods for computing -//! a parallel reduction of items partitioned across a CUDA thread block. +//! @file +//! The cub::BlockReduce class provides :ref:`collective ` methods for computing a parallel +//! reduction of items partitioned across a CUDA thread block. #pragma once diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index df7ab6e814..afc4df76d7 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockScan class provides :ref:`collective ` methods for computing a -//! parallel prefix sum/scan of items partitioned across a CUDA thread block. +//! @file +//! The cub::BlockScan class provides :ref:`collective ` methods for computing a parallel prefix +//! sum/scan of items partitioned across a CUDA thread block. #pragma once diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index 048c6e3a8e..a3dedcc3c7 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file The cub::BlockShuffle class provides :ref:`collective ` methods for shuffling -//! data partitioned across a CUDA thread block. +//! @file +//! The cub::BlockShuffle class provides :ref:`collective ` methods for shuffling data +//! partitioned across a CUDA thread block. #pragma once diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 6c9f4f57a8..9d057d7fe4 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -26,7 +26,8 @@ * ******************************************************************************/ -//! @file Operations for writing linear segments of data from the CUDA thread block +//! @file +//! Operations for writing linear segments of data from the CUDA thread block #pragma once diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh index 0d222475b2..a6d24a5221 100644 --- a/cub/cub/device/device_copy.cuh +++ b/cub/cub/device/device_copy.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file cub::DeviceCopy provides device-wide, parallel operations for copying data. +//! @file +//! cub::DeviceCopy provides device-wide, parallel operations for copying data. #pragma once diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 46f4bee557..e6abc4bd07 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceHistogram provides device-wide parallel operations for -//! constructing histogram(s) from a sequence of samples data residing -//! within device-accessible memory. +//! @file +//! cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of +//! samples data residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh index 1359863a76..e71431cb74 100644 --- a/cub/cub/device/device_memcpy.cuh +++ b/cub/cub/device/device_memcpy.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file cub::DeviceMemcpy provides device-wide, parallel operations for copying data. +//! @file +//! cub::DeviceMemcpy provides device-wide, parallel operations for copying data. #pragma once diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 08a2ae531f..28bfc377bd 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file cub::DevicePartition provides device-wide, parallel operations for -//! partitioning sequences of data items residing within device-accessible memory. +//! @file +//! cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing +//! within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_radix_sort.cuh b/cub/cub/device/device_radix_sort.cuh index c653badc47..a14c5e4364 100644 --- a/cub/cub/device/device_radix_sort.cuh +++ b/cub/cub/device/device_radix_sort.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceRadixSort provides device-wide, parallel operations for -//! computing a radix sort across a sequence of data items residing within -//! device-accessible memory. +//! @file +//! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data +//! items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index a31e641920..4b02129123 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceReduce provides device-wide, parallel operations for -//! computing a reduction across a sequence of data items residing within -//! device-accessible memory. +//! @file +//! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data +//! items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index f3b1a3e669..9020f4fe05 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceRunLengthEncode provides device-wide, parallel operations -//! for computing a run-length encoding across a sequence of data items -//! residing within device-accessible memory. +//! @file +//! cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a +//! sequence of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 29f3cf6c1e..1ec282d978 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across -//! a sequence of data items residing within device-accessible memory. +//! @file +//! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data +//! items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index eb6eecdcf3..cc627b971c 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -26,8 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort -//! across multiple, non-overlapping sequences of data items residing within device-accessible memory. +//! @file +//! cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across +//! multiple, non-overlapping sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 6a0875734e..ec5d017fc2 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSegmentedReduce provides device-wide, parallel operations -//! for computing a batched reduction across multiple sequences of data -//! items residing within device-accessible memory. +//! @file +//! cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across +//! multiple sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 67a22c5e54..7d01b6d56a 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -25,9 +25,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSegmentedSort provides device-wide, parallel operations for -//! computing a batched sort across multiple, non-overlapping sequences of -//! data items residing within device-accessible memory. +//! @file +//! cub::DeviceSegmentedSort provides device-wide, parallel operations for computing a batched sort across multiple, +//! non-overlapping sequences of data items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 703a912829..332bbe6c7d 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -26,9 +26,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSelect provides device-wide, parallel operations for -//! compacting selected items from sequences of data items residing within -//! device-accessible memory. +//! @file +//! cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data +//! items residing within device-accessible memory. #pragma once diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 32ac433f3e..8b7e60d435 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -27,8 +27,9 @@ * ******************************************************************************/ -//! @file cub::DeviceSpmv provides device-wide parallel operations for performing -//! sparse-matrix * vector multiplication (SpMV). +//! @file +//! cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication +//! (SpMV). #pragma once diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index d5beca2f6b..d605294693 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Detect the version of the C++ standard used by the compiler. +//! @file +//! Detect the version of the C++ standard used by the compiler. #pragma once diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index bfcef99656..ac5c700b95 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Operations for reading linear tiles of data into the CUDA warp. +//! @file +//! Operations for reading linear tiles of data into the CUDA warp. #pragma once diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index e123330ba1..bb99bc5965 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -25,7 +25,8 @@ * ******************************************************************************/ -//! @file Operations for writing linear segments of data from the CUDA warp +//! @file +//! Operations for writing linear segments of data from the CUDA warp #pragma once diff --git a/cub/test/catch2_main.cuh b/cub/test/catch2_main.cuh index fc08aa13eb..1d42355ce7 100644 --- a/cub/test/catch2_main.cuh +++ b/cub/test/catch2_main.cuh @@ -29,10 +29,10 @@ #include -//! @file This file includes a custom Catch2 main function. When CMake is configured to build -//! each test as a separate executable, this header is included into each test. On the other -//! hand, when all the tests are compiled into a single executable, this header is excluded -//! from the tests and included into catch2_runner.cpp +//! @file +//! This file includes a custom Catch2 main function. When CMake is configured to build each test as a separate +//! executable, this header is included into each test. On the other hand, when all the tests are compiled into a single +//! executable, this header is excluded from the tests and included into catch2_runner.cpp #ifdef CUB_CONFIG_MAIN # define CATCH_CONFIG_RUNNER diff --git a/cub/test/catch2_runner.cpp b/cub/test/catch2_runner.cpp index 53a19f7b6a..73f3f70d8a 100644 --- a/cub/test/catch2_runner.cpp +++ b/cub/test/catch2_runner.cpp @@ -25,8 +25,9 @@ * ******************************************************************************/ -//! @file This file includes a custom Catch2 main function when CMake is configured to build -//! all tests into a single executable. +//! @file +//! This file includes a custom Catch2 main function when CMake is configured to build all tests into a single +//! executable. #define CUB_CONFIG_MAIN #define CUB_EXCLUDE_CATCH2_HELPER_IMPL diff --git a/cub/test/catch2_runner_helper.cu b/cub/test/catch2_runner_helper.cu index d16e09f742..628f9525f7 100644 --- a/cub/test/catch2_runner_helper.cu +++ b/cub/test/catch2_runner_helper.cu @@ -25,9 +25,9 @@ * ******************************************************************************/ -//! @file This file includes CUDA-specific utilities for custom Catch2 main function when CMake is -//! configured to build all tests into a single executable. In this case, we have to have -//! a CUDA target in the final Catch2 executable, otherwise CMake confuses linker options and -//! MSVC/RDC build fails. +//! @file +//! This file includes CUDA-specific utilities for custom Catch2 main function when CMake is configured to build all +//! tests into a single executable. In this case, we have to have a CUDA target in the final Catch2 executable, +//! otherwise CMake confuses linker options and MSVC/RDC build fails. #include "catch2_runner_helper.inl" diff --git a/cub/test/catch2_runner_helper.inl b/cub/test/catch2_runner_helper.inl index 3971760800..f8a2bfa2ab 100644 --- a/cub/test/catch2_runner_helper.inl +++ b/cub/test/catch2_runner_helper.inl @@ -27,10 +27,10 @@ #pragma once -//! @file This file includes implementation of CUDA-specific utilities for custom Catch2 main -//! When CMake is configured to include all the tests into a single executable, this file -//! is only included into catch2_runner_helper.cu. When CMake is configured to compile -//! each test as a separate binary, this file is included into each test. +//! @file +//! This file includes implementation of CUDA-specific utilities for custom Catch2 main When CMake is configured to +//! include all the tests into a single executable, this file is only included into catch2_runner_helper.cu. When CMake +//! is configured to compile each test as a separate binary, this file is included into each test. #include diff --git a/cub/test/catch2_test_launch_helper.h b/cub/test/catch2_test_launch_helper.h index 311fea93b1..11da1a32e3 100644 --- a/cub/test/catch2_test_launch_helper.h +++ b/cub/test/catch2_test_launch_helper.h @@ -31,7 +31,8 @@ #include "catch2_test_helper.h" -//! @file This file contains utilities for device-scope API tests +//! @file +//! This file contains utilities for device-scope API tests //! //! Device-scope API in CUB can be launched from the host or device side. //! Utilities in this file facilitate testing in both cases. diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.h b/cudax/include/cuda/experimental/__container/uninitialized_buffer.h index 105d939322..b20d7b2a93 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.h +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.h @@ -30,7 +30,8 @@ #if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) -//! @file The \c uninitialized_buffer class provides a typed buffer allocated from a given memory resource. +//! @file +//! The \c uninitialized_buffer class provides a typed buffer allocated from a given memory resource. namespace cuda::experimental { diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index d3c1ae1f91..d2e4296b74 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -21,7 +21,8 @@ #endif // no system header //!@rst -//! @file Defines facilities to allocate and deallocate memory in a type safe manner +//! @file +//! Defines facilities to allocate and deallocate memory in a type safe manner //! //! .. note:: //! diff --git a/thrust/thrust/system/cpp/pointer.h b/thrust/thrust/system/cpp/pointer.h index 5b5dfd6326..1913886157 100644 --- a/thrust/thrust/system/cpp/pointer.h +++ b/thrust/thrust/system/cpp/pointer.h @@ -14,9 +14,8 @@ * limitations under the License. */ -/*! \file thrust/system/cpp/memory.h - * \brief Managing memory associated with Thrust's TBB system. - */ +//! \file +//! Managing memory associated with Thrust's TBB system. #pragma once From 38d578727005ea5b558ad076bb03cb3336acb036 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 20 Aug 2024 10:26:26 +0200 Subject: [PATCH 04/71] Add tests for transform_iterator's reference type (#2221) * Fix typo * Add tests for transform_iterator's reference type --- thrust/testing/transform_iterator.cu | 102 +++++++++++++++++++++++++++ thrust/thrust/detail/reference.h | 2 +- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/thrust/testing/transform_iterator.cu b/thrust/testing/transform_iterator.cu index 7bb87d4625..53bc4db00a 100644 --- a/thrust/testing/transform_iterator.cu +++ b/thrust/testing/transform_iterator.cu @@ -6,6 +6,7 @@ #include #include +#include #include @@ -108,3 +109,104 @@ void TestTransformIteratorNonCopyable() } DECLARE_UNITTEST(TestTransformIteratorNonCopyable); + +struct flip_value +{ + _CCCL_HOST_DEVICE bool operator()(bool b) const + { + return !b; + } +}; + +struct pass_ref +{ + _CCCL_HOST_DEVICE const bool& operator()(const bool& b) const + { + return b; + } +}; + +// TODO(bgruber): replace by libc++ with C++14 +struct forward +{ + template + constexpr _Tp&& operator()(_Tp&& __t) const noexcept + { + return _CUDA_VSTD::forward<_Tp>(__t); + } +}; + +void TestTransformIteratorReferenceAndValueType() +{ + using ::cuda::std::is_same; + using ::cuda::std::negate; + { + thrust::host_vector v; + + auto it = v.begin(); + static_assert(is_same::value, ""); // ordinary reference + static_assert(is_same::value, ""); + + auto it_tr_val = thrust::make_transform_iterator(it, flip_value{}); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + (void) it_tr_val; + + auto it_tr_ref = thrust::make_transform_iterator(it, pass_ref{}); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + (void) it_tr_ref; + + auto it_tr_fwd = thrust::make_transform_iterator(it, forward{}); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + (void) it_tr_fwd; + } + + { + thrust::device_vector v; + + auto it = v.begin(); + static_assert(is_same>::value, ""); // proxy reference + static_assert(is_same::value, ""); + + auto it_tr_val = thrust::make_transform_iterator(it, flip_value{}); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + (void) it_tr_val; + + auto it_tr_ref = thrust::make_transform_iterator(it, pass_ref{}); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + (void) it_tr_ref; + + auto it_tr_fwd = thrust::make_transform_iterator(it, forward{}); + static_assert(is_same::value, ""); // wrapped reference is decayed + static_assert(is_same::value, ""); + (void) it_tr_fwd; + } + + { + std::vector v; + + auto it = v.begin(); + static_assert(is_same::reference>::value, ""); // proxy reference + static_assert(is_same::value, ""); + + auto it_tr_val = thrust::make_transform_iterator(it, flip_value{}); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + (void) it_tr_val; + + auto it_tr_ref = thrust::make_transform_iterator(it, pass_ref{}); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + (void) it_tr_ref; + + auto it_tr_fwd = thrust::make_transform_iterator(it, forward{}); + static_assert(is_same::value, ""); // proxy reference is decayed + static_assert(is_same::value, ""); + (void) it_tr_fwd; + } +} +DECLARE_UNITTEST(TestTransformIteratorReferenceAndValueType); diff --git a/thrust/thrust/detail/reference.h b/thrust/thrust/detail/reference.h index c96649f74e..88fdf3d87d 100644 --- a/thrust/thrust/detail/reference.h +++ b/thrust/thrust/detail/reference.h @@ -308,7 +308,7 @@ class reference pointer const ptr; // `thrust::detail::is_wrapped_reference` is a trait that indicates whether - // a type is a fancy reference. It detects such types by loooking for a + // a type is a fancy reference. It detects such types by looking for a // nested `wrapped_reference_hint` type. struct wrapped_reference_hint {}; From c92e8d48ab0e673dceece34de31c94b5e8f20f91 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 20 Aug 2024 12:51:29 +0200 Subject: [PATCH 05/71] Small tuning script output improvements (#2262) * Report mismatched values in first_val * Improve output while search.py is running --- benchmarks/scripts/cccl/bench/bench.py | 6 +++--- benchmarks/scripts/cccl/bench/cmake.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/scripts/cccl/bench/bench.py b/benchmarks/scripts/cccl/bench/bench.py index f93f2eff57..049dcbb601 100644 --- a/benchmarks/scripts/cccl/bench/bench.py +++ b/benchmarks/scripts/cccl/bench/bench.py @@ -19,7 +19,7 @@ def first_val(my_dict): first_value = values[0] if not all(value == first_value for value in values): - raise ValueError('All values in the dictionary are not equal') + raise ValueError('All values in the dictionary are not equal. First value: {} All values: {}'.format(first_value, values)) return first_value @@ -648,11 +648,11 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True): p.wait(timeout=timeout) elapsed = time.time() - begin - logger.info("finished benchmark {} with {} ({}) in {}s".format(self.label(), ct_point, p.returncode, elapsed)) + logger.info("finished benchmark {} with {} ({}) in {:.3f}s".format(self.label(), ct_point, p.returncode, elapsed)) return BenchResult(result_path, p.returncode, elapsed) except subprocess.TimeoutExpired: - logger.info("benchmark {} with {} reached timeout of {}s".format(self.label(), ct_point, timeout)) + logger.info("benchmark {} with {} reached timeout of {:.3f}s".format(self.label(), ct_point, timeout)) os.killpg(os.getpgid(p.pid), signal.SIGTERM) return BenchResult(None, 42, float('inf')) diff --git a/benchmarks/scripts/cccl/bench/cmake.py b/benchmarks/scripts/cccl/bench/cmake.py index 095531a005..4340c999c6 100644 --- a/benchmarks/scripts/cccl/bench/cmake.py +++ b/benchmarks/scripts/cccl/bench/cmake.py @@ -80,7 +80,7 @@ def do_build(self, bench, timeout): stderr=subprocess.DEVNULL) p.wait(timeout=timeout) elapsed = time.time() - begin - logger.info("finished build for {} ({}) in {}s".format(bench.label(), p.returncode, elapsed)) + logger.info("finished build for {} (exit code: {}) in {:.3f}s".format(bench.label(), p.returncode, elapsed)) return Build(p.returncode, elapsed) except subprocess.TimeoutExpired: From 7bec0ce1647c802f6dd245d1dd6bb9ac8adab62e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 20 Aug 2024 18:17:37 +0200 Subject: [PATCH 06/71] Fix Thrust::vector ctor selection for int,int (#2261) thrust::device_vector v(5, 10) should create a vector with 5 integers of value 10, and not attempt the iterator pair constructor. --- thrust/thrust/detail/vector_base.h | 8 ++++++-- thrust/thrust/detail/vector_base.inl | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/thrust/thrust/detail/vector_base.h b/thrust/thrust/detail/vector_base.h index e11b19cfbe..2da88cf5d8 100644 --- a/thrust/thrust/detail/vector_base.h +++ b/thrust/thrust/detail/vector_base.h @@ -37,6 +37,8 @@ #include #include +#include + #include #include @@ -186,7 +188,8 @@ class vector_base * \param first The beginning of the range. * \param last The end of the range. */ - template + template ::value, int> = 0> vector_base(InputIterator first, InputIterator last); /*! This constructor builds a vector_base from a range. @@ -194,7 +197,8 @@ class vector_base * \param last The end of the range. * \param alloc The allocator to use by this vector_base. */ - template + template ::value, int> = 0> vector_base(InputIterator first, InputIterator last, const Alloc& alloc); /*! The destructor erases the elements. diff --git a/thrust/thrust/detail/vector_base.inl b/thrust/thrust/detail/vector_base.inl index 5f0cb87e12..f37da9791e 100644 --- a/thrust/thrust/detail/vector_base.inl +++ b/thrust/thrust/detail/vector_base.inl @@ -270,7 +270,8 @@ void vector_base::range_init(ForwardIterator first, ForwardIterator la } // end vector_base::range_init() template -template +template ::value, int>> vector_base::vector_base(InputIterator first, InputIterator last) : m_storage() , m_size(0) @@ -283,7 +284,8 @@ vector_base::vector_base(InputIterator first, InputIterator last) } // end vector_base::vector_base() template -template +template ::value, int>> vector_base::vector_base(InputIterator first, InputIterator last, const Alloc& alloc) : m_storage(alloc) , m_size(0) From 06e334f3460088e5305369d5ed7bc4c9d960dcc3 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 21 Aug 2024 06:58:25 +0200 Subject: [PATCH 07/71] Adds support for large number of items to `DeviceScan` (#2171) * make DeviceScan offset type a template parameter * updates tests to use device interface * moves thrust scan to unsigned offset types * adjusts benchmarks to account for used offset types * uses dynamic dispatch to unsigned type * adds tparam docs for NumItemsT * fixes warning about different signedness comparison * adds check for negative num_items in thrust::scan * fixes unused param in is_negative --- cub/benchmarks/bench/scan/exclusive/base.cuh | 2 +- cub/cub/device/device_scan.cuh | 152 +++++++++++------- .../catch2_test_device_scan_large_offsets.cu | 41 ++--- thrust/thrust/detail/integer_math.h | 14 ++ thrust/thrust/system/cuda/detail/dispatch.h | 15 ++ thrust/thrust/system/cuda/detail/scan.h | 27 +++- 6 files changed, 159 insertions(+), 92 deletions(-) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 65b760fba2..4289793167 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -129,7 +129,7 @@ static void basic(nvbench::state& state, nvbench::type_list) }); } -using some_offset_types = nvbench::type_list; +using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 1ec282d978..c9d93c935b 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -42,6 +42,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -153,6 +154,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -173,19 +177,19 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using InitT = cub::detail::value_t; // Initial value @@ -196,13 +200,13 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -262,6 +266,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access iterator type for reading scan inputs and wrigin scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -279,20 +286,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -378,6 +385,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -404,7 +414,7 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -412,13 +422,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -432,7 +442,7 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -440,7 +450,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -521,6 +531,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -544,28 +557,28 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -656,6 +669,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -686,7 +702,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -694,13 +711,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -718,7 +735,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -726,7 +744,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -810,6 +828,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -833,28 +854,36 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -924,6 +953,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -944,32 +976,32 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1028,6 +1060,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -1045,20 +1080,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1138,6 +1173,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1162,20 +1200,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); @@ -1221,6 +1259,9 @@ struct DeviceScan //! @tparam InitValueT //! **[inferred]** Type of the `init_value` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1247,7 +1288,7 @@ struct DeviceScan //! //! @param[in] stream //! CUDA stream to launch kernels within. - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1255,13 +1296,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using AccumT = cub::detail::accumulator_t>; constexpr bool ForceInclusive = true; @@ -1284,14 +1325,14 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1365,6 +1406,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1386,26 +1430,26 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9d00d89e14..0c0854e21e 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -35,33 +35,12 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" -// TODO(elstehle) replace with DeviceScan interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceScan algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_scan_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream = 0) -{ - using init_value_t = cub::detail::InputValue; - init_value_t init_value_wrapper{init_value}; - - return cub::DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value_wrapper, num_items, stream); -} - -DECLARE_LAUNCH_WRAPPER(dispatch_scan_wrapper, dispatch_exclusive_scan); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScan, device_exclusive_scan); // %PARAM% TEST_LAUNCH lid 0:1:2 -// TODO(elstehle) replace with actual offset types, once https://github.com/NVIDIA/cccl/issues/50 is addresed // List of offset types to be used for testing large number of items -using offset_types = c2h::type_list; +using offset_types = c2h::type_list; template struct expected_sum_op @@ -106,12 +85,12 @@ try offset_t num_items_max = static_cast(num_items_max_ull); offset_t num_items_min = num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; - // TODO(elstehle) remove single-item size, once https://github.com/NVIDIA/cccl/issues/50 is addresed - offset_t num_items = - GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), static_cast(1)}), - take(2, random(num_items_min, num_items_max))); + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); - // Prepare input + // Prepare input (generate a series of: 0, 1, 2, ..., , 0, 1, 2, ..., , 0, 1, ...) constexpr index_t segment_size = 1000; auto index_it = thrust::make_counting_iterator(index_t{}); auto items_it = thrust::make_transform_iterator(index_it, mod_op{segment_size}); @@ -120,8 +99,12 @@ try c2h::device_vector d_items_out(num_items); auto d_items_out_it = thrust::raw_pointer_cast(d_items_out.data()); + c2h::device_vector d_initial_value(1); + d_initial_value[0] = item_t{}; + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + // Run test - dispatch_exclusive_scan(items_it, d_items_out_it, op_t{}, item_t{}, num_items); + device_exclusive_scan(items_it, d_items_out_it, op_t{}, future_init_value, num_items); // Ensure that we created the correct output auto expected_out_it = diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h index ab37d9a3a9..730b084767 100644 --- a/thrust/thrust/detail/integer_math.h +++ b/thrust/thrust/detail/integer_math.h @@ -27,6 +27,8 @@ #endif // no system header #include +#include + #include #include @@ -60,6 +62,18 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x) return 0 == (x & (x - 1)); } +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T x) +{ + return x < 0; +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T) +{ + return false; +} + template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_odd(Integer x) { diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h index 90c99688f7..971b93d628 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h @@ -90,6 +90,21 @@ status = call_64 arguments; \ } +/// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the +/// `count` argument. `count` must not be negative. +#define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + if (static_cast(count) \ + <= static_cast(thrust::detail::integer_traits::const_max)) \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_32 arguments; \ + } \ + else \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_64 arguments; \ + } + /// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. #define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index e225f2cfe4..e9405776db 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -42,6 +42,7 @@ # include +# include # include # include # include @@ -63,16 +64,21 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( thrust::cuda_cub::execution_policy& policy, InputIt first, Size num_items, OutputIt result, ScanOp scan_op) { using AccumT = typename thrust::iterator_traits::value_type; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -88,7 +94,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -113,16 +119,21 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( ScanOp scan_op) { using InputValueT = cub::detail::InputValue; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -138,7 +149,7 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, From 1e1af8d4d62ed15cc127cdb112d6b506e27668f7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 21 Aug 2024 09:48:56 +0200 Subject: [PATCH 08/71] Use/Test radix sort for int128, half, bfloat16 in Thrust (#2168) int128 was already working but not covered by a test. --- thrust/testing/cuda/sort.cu | 59 +++++++++++++++++++++++-- thrust/testing/unittest/meta.h | 15 +++++++ thrust/thrust/system/cuda/detail/sort.h | 11 ++++- 3 files changed, 81 insertions(+), 4 deletions(-) diff --git a/thrust/testing/cuda/sort.cu b/thrust/testing/cuda/sort.cu index 8e7e5542e7..6962a39640 100644 --- a/thrust/testing/cuda/sort.cu +++ b/thrust/testing/cuda/sort.cu @@ -9,6 +9,7 @@ #include +#include #include #include @@ -164,9 +165,23 @@ struct TestRadixSortDispatch void operator()() const {} }; -// TODO(bgruber): use a single test case with a concatenated key list and a cartesion product with the comparators -SimpleUnitTest TestRadixSortDispatchIntegralInstance; -SimpleUnitTest TestRadixSortDispatchFPInstance; +SimpleUnitTest +#endif // _LIBCUDACXX_HAS_NO_INT128 +#ifdef _CCCL_HAS_NVFP16 + , + unittest::type_list<__half> +#endif // _CCCL_HAS_NVFP16 +#ifdef _CCCL_HAS_NVBF16 + , + unittest::type_list<__nv_bfloat16> +#endif // _CCCL_HAS_NVBF16 + >> + TestRadixSortDispatchInstance; /** * Copy of CUB testing utility @@ -263,3 +278,41 @@ void TestSortWithLargeNumberOfItems() TestSortWithMagnitude(33); } DECLARE_UNITTEST(TestSortWithLargeNumberOfItems); + +template +struct TestSortAscendingKey +{ + void operator()() const + { + constexpr int n = 10000; + + thrust::host_vector h_data = unittest::random_integers(n); + thrust::device_vector d_data = h_data; + + std::sort(h_data.begin(), h_data.end(), thrust::less{}); + thrust::sort(d_data.begin(), d_data.end(), thrust::less{}); + + ASSERT_EQUAL_QUIET(h_data, d_data); + } +}; + +SimpleUnitTest +#ifndef _LIBCUDACXX_HAS_NO_INT128 + , + unittest::type_list<__int128_t, __uint128_t> +#endif +// CTK 12.2 offers __host__ __device__ operators for __half and __nv_bfloat16, so we can use std::sort +#if _CCCL_CUDACC_VER >= 1202000 +# if defined(_CCCL_HAS_NVFP16) || !defined(__CUDA_NO_HALF_OPERATORS__) && !defined(__CUDA_NO_HALF_CONVERSIONS__) + , + unittest::type_list<__half> +# endif +# if defined(_CCCL_HAS_NVBF16) \ + || !defined(__CUDA_NO_BFLOAT16_OPERATORS__) && !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) + , + unittest::type_list<__nv_bfloat16> +# endif +#endif // _CCCL_CUDACC_VER >= 1202000 + >> + TestSortAscendingKeyMoreTypes; diff --git a/thrust/testing/unittest/meta.h b/thrust/testing/unittest/meta.h index 7fd90fa014..30cb835d55 100644 --- a/thrust/testing/unittest/meta.h +++ b/thrust/testing/unittest/meta.h @@ -157,4 +157,19 @@ struct transform2, type_list, Template> using type = type_list::type...>; }; +template +struct concat; + +template +struct concat +{ + using type = L; +}; + +template