From 387e1f520d217b3c041d149c1bb73d23256892e5 Mon Sep 17 00:00:00 2001 From: Georgii Evtushenko Date: Thu, 30 Nov 2023 04:07:23 +0400 Subject: [PATCH 1/4] Port device docs to rst (#1160) --- cub/cub/block/block_load.cuh | 2 +- cub/cub/block/block_merge_sort.cuh | 1 - cub/cub/block/block_radix_rank.cuh | 11 +- cub/cub/block/block_radix_sort.cuh | 6 - cub/cub/block/block_reduce.cuh | 2 +- cub/cub/block/block_run_length_decode.cuh | 6 +- cub/cub/block/block_shuffle.cuh | 4 +- cub/cub/device/device_adjacent_difference.cuh | 871 ++-- cub/cub/device/device_copy.cuh | 209 +- cub/cub/device/device_histogram.cuh | 2042 ++++---- cub/cub/device/device_memcpy.cuh | 213 +- cub/cub/device/device_merge_sort.cuh | 2 - cub/cub/device/device_partition.cuh | 843 ++-- cub/cub/device/device_radix_sort.cuh | 21 +- cub/cub/device/device_reduce.cuh | 1527 +++--- cub/cub/device/device_run_length_encode.cuh | 472 +- cub/cub/device/device_scan.cuh | 2815 ++++++------ .../device/device_segmented_radix_sort.cuh | 2177 +++++---- cub/cub/device/device_segmented_reduce.cuh | 1352 +++--- cub/cub/device/device_segmented_sort.cuh | 4094 +++++++++-------- cub/cub/device/device_select.cuh | 1223 +++-- cub/cub/device/device_spmv.cuh | 246 +- .../device/dispatch/dispatch_batch_memcpy.cuh | 6 +- .../device/dispatch/dispatch_histogram.cuh | 4 +- .../device/dispatch/dispatch_radix_sort.cuh | 36 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 30 +- cub/cub/device/dispatch/dispatch_rle.cuh | 18 +- cub/cub/device/dispatch/dispatch_scan.cuh | 8 +- .../dispatch/dispatch_segmented_sort.cuh | 20 +- .../device/dispatch/dispatch_spmv_orig.cuh | 8 +- .../dispatch/dispatch_unique_by_key.cuh | 18 +- cub/cub/grid/grid_barrier.cuh | 9 - cub/cub/grid/grid_even_share.cuh | 9 - cub/cub/grid/grid_mapping.cuh | 10 - cub/cub/grid/grid_queue.cuh | 11 - cub/cub/iterator/arg_index_input_iterator.cuh | 9 - .../cache_modified_input_iterator.cuh | 11 - .../cache_modified_output_iterator.cuh | 9 - 
cub/cub/iterator/constant_input_iterator.cuh | 10 - cub/cub/iterator/counting_input_iterator.cuh | 9 - cub/cub/iterator/discard_output_iterator.cuh | 9 - cub/cub/iterator/tex_obj_input_iterator.cuh | 11 - cub/cub/iterator/tex_ref_input_iterator.cuh | 7 - cub/cub/iterator/transform_input_iterator.cuh | 10 - cub/cub/thread/thread_load.cuh | 8 - cub/cub/thread/thread_operators.cuh | 9 - cub/cub/thread/thread_scan.cuh | 11 +- cub/cub/thread/thread_store.cuh | 10 - cub/cub/util_allocator.cuh | 11 - cub/cub/util_cpp_dialect.cuh | 8 +- cub/cub/util_debug.cuh | 8 - cub/cub/util_device.cuh | 10 - cub/cub/util_macro.cuh | 9 +- cub/cub/util_ptx.cuh | 14 - cub/cub/util_temporary_storage.cuh | 7 - cub/cub/util_type.cuh | 10 - cub/cub/warp/warp_exchange.cuh | 1 - cub/docs/benchmarking.rst | 2 +- cub/docs/index.rst | 1 + cub/docs/repo.toml | 3 +- 60 files changed, 9022 insertions(+), 9511 deletions(-) diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index d95cca4e34..0a4d6d0be0 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -755,7 +755,7 @@ enum BlockLoadAlgorithm //! using CUDA's built-in vectorized loads as a coalescing optimization. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_TRANSPOSE`: //! A :ref:`striped arrangement ` of data is read directly from memory and is then -//! locally transposed into a `blocked arrangement `. +//! locally transposed into a :ref:`blocked arrangement `. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE`: //! A :ref:`warp-striped arrangement ` of data is read directly from memory and is then //! locally transposed into a :ref:`blocked arrangement `. 
diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index f824e6025c..451a079d97 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -682,7 +682,6 @@ private: /** * @brief The BlockMergeSort class provides methods for sorting items * partitioned across a CUDA thread block using a merge sorting method. - * @ingroup BlockModule * * @tparam KeyT * KeyT type diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 7757dea1bc..09f6e14d20 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -276,7 +276,7 @@ private: BlockScan; - /// Shared memory storage layout type for BlockRadixRank + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document struct __align__(16) _TempStorage { union Aliasable @@ -289,6 +289,7 @@ private: // Storage for scanning local ranks typename BlockScan::TempStorage block_scan; }; + #endif // !DOXYGEN_SHOULD_SKIP_THIS /// Shared storage reference _TempStorage &temp_storage; @@ -634,7 +635,7 @@ private: BlockScanT; - /// Shared memory storage layout type for BlockRadixRank + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document struct __align__(16) _TempStorage { typename BlockScanT::TempStorage block_scan; @@ -646,6 +647,7 @@ private: } aliasable; }; + #endif // !DOXYGEN_SHOULD_SKIP_THIS /// Shared storage reference _TempStorage &temp_storage; @@ -657,7 +659,7 @@ private: public: - /// @smemstorage{BlockScan} + /// @smemstorage{BlockRadixRankMatch} struct TempStorage : Uninitialized<_TempStorage> {}; @@ -957,9 +959,6 @@ struct BlockRadixRankMatchEarlyCounts // types typedef cub::BlockScan BlockScan; - - - // temporary storage struct TempStorage { union diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 538a806973..1d0cb52adb 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -182,8 +182,6 @@ CUB_NAMESPACE_BEGIN //! 
This example can be easily adapted to the storage required by BlockRadixSort. //! @endrst //! -//! @ingroup BlockModule -//! //! @tparam KeyT //! KeyT type //! @@ -2231,8 +2229,4 @@ public: }; -/** - * \example example_block_radix_sort.cu - */ - CUB_NAMESPACE_END diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index cf7dc2fd64..dc240382d1 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -105,7 +105,7 @@ enum BlockReduceAlgorithm //! single warp rake across segments of shared partial reductions. //! #. A warp-synchronous Kogge-Stone style reduction within the raking warp. //! - //! @par Performance Considerations + //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - This variant performs more communication than BLOCK_REDUCE_RAKING diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index f181835279..e4544fdc12 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -60,9 +60,9 @@ CUB_NAMESPACE_BEGIN //! the specified window will be returned. //! //! .. note:: -//! Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). -//! A run of length zero may not be followed by a run length that is not zero. -//! +//! +//! Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). +//! A run of length zero may not be followed by a run length that is not zero. //! //! .. code-block:: c++ //! diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index 43ff0b190c..5b4b572543 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -185,7 +185,7 @@ public: //! //! - @smemreuse //! - //! @rst + //! @endrst //! //! @param[in] input //! The calling thread's input item @@ -311,7 +311,7 @@ public: //! @rst //! 
The thread block rotates its :ref:`blocked arrangement ` of input items, - //! shifting it down by one item. All threads receive ``input[0]` provided by *thread*\ :sub:`0`. + //! shifting it down by one item. All threads receive ``input[0]`` provided by *thread*\ :sub:`0`. //! //! - @blocked //! - @granularity diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index cffdb5e153..5bcf9badbc 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -50,63 +50,66 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DeviceAdjacentDifference provides device-wide, parallel operations for - * computing the differences of adjacent elements residing within - * device-accessible memory. - * - * @ingroup SingleModule - * - * @par Overview - * - DeviceAdjacentDifference calculates the differences of adjacent elements in - * d_input. Because the binary operation could be noncommutative, there - * are two sets of methods. Methods named SubtractLeft subtract left element - * `*(i - 1)` of input sequence from current element `*i`. - * Methods named `SubtractRight` subtract current element `*i` from the - * right one `*(i + 1)`: - * @par - * @code - * int *d_values; // [1, 2, 3, 4] - * //... - * int *d_subtract_left_result <-- [ 1, 1, 1, 1 ] - * int *d_subtract_right_result <-- [ -1, -1, -1, 4 ] - * @endcode - * - For SubtractLeft, if the left element is out of bounds, the iterator is - * assigned to \*(result + (i - first)) without modification. - * - For SubtractRight, if the right element is out of bounds, the iterator is - * assigned to \*(result + (i - first)) without modification. - * - * @par Snippet - * The code snippet below illustrates how to use @p DeviceAdjacentDifference to - * compute the left difference between adjacent elements. 
- * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_values; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * //... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, d_values, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, d_values, num_items); - * - * // d_values <-- [1, 1, -1, 1, -1, 1, -1, 1] - * @endcode - */ +//! @rst +//! DeviceAdjacentDifference provides device-wide, parallel operations for +//! computing the differences of adjacent elements residing within +//! device-accessible memory. +//! +//! Overview +//! ++++++++++++++++++++++++++ +//! +//! - DeviceAdjacentDifference calculates the differences of adjacent elements in +//! d_input. Because the binary operation could be noncommutative, there +//! are two sets of methods. Methods named SubtractLeft subtract left element +//! ``*(i - 1)`` of input sequence from current element ``*i``. +//! Methods named ``SubtractRight`` subtract current element ``*i`` from the +//! right one ``*(i + 1)``: +//! +//! .. code-block:: c++ +//! +//! int *d_values; // [1, 2, 3, 4] +//! //... +//! int *d_subtract_left_result <-- [ 1, 1, 1, 1 ] +//! int *d_subtract_right_result <-- [ -1, -1, -1, 4 ] +//! +//! - For SubtractLeft, if the left element is out of bounds, the iterator is +//! assigned to ``*(result + (i - first))`` without modification. +//! - For SubtractRight, if the right element is out of bounds, the iterator is +//! assigned to ``*(result + (i - first))`` without modification. +//! +//! Snippet +//! ++++++++++++++++++++++++++ +//! +//! 
The code snippet below illustrates how to use ``DeviceAdjacentDifference`` to +//! compute the left difference between adjacent elements. +//! +//! .. code-block:: c++ +//! +//! #include +//! // or equivalently +//! +//! // Declare, allocate, and initialize device-accessible pointers +//! int num_items; // e.g., 8 +//! int *d_values; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] +//! //... +//! +//! // Determine temporary device storage requirements +//! void *d_temp_storage = NULL; +//! size_t temp_storage_bytes = 0; +//! +//! cub::DeviceAdjacentDifference::SubtractLeft( +//! d_temp_storage, temp_storage_bytes, d_values, num_items); +//! +//! // Allocate temporary storage +//! cudaMalloc(&d_temp_storage, temp_storage_bytes); +//! +//! // Run operation +//! cub::DeviceAdjacentDifference::SubtractLeft( +//! d_temp_storage, temp_storage_bytes, d_values, num_items); +//! +//! // d_values <-- [1, 1, -1, 1, -1, 1, -1, 1] +//! +//! @endrst struct DeviceAdjacentDifference { private: @@ -145,106 +148,113 @@ private: public: - /** - * @brief Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. - * @ingroup SingleModule - * - * @par Overview - * - Calculates the differences of adjacent elements in `d_input`. That is, - * `*d_input` is assigned to `*d_output`, and, for each iterator `i` in the - * range `[d_input + 1, d_input + num_items)`, the result of - * `difference_op(*i, *(i - 1))` is assigned to - * `*(d_output + (i - d_input))`. - * - Note that the behavior is undefined if the input and output ranges - * overlap in any way. - * - * @par Snippet - * The code snippet below illustrates how to use @p DeviceAdjacentDifference - * to compute the difference between adjacent elements. 
- * - * @par - * @code - * #include - * // or equivalently - * - * struct CustomDifference - * { - * template - * __device__ DataType operator()(DataType &lhs, DataType &rhs) - * { - * return lhs - rhs; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * int *d_output; - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * - * cub::DeviceAdjacentDifference::SubtractLeftCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, - * num_items, CustomDifference()); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractLeftCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, - * num_items, CustomDifference()); - * - * // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] - * // d_output <-- [1, 1, -1, 1, -1, 1, -1, 1] - * @endcode - * - * @tparam InputIteratorT - * is a model of Input Iterator, - * and `x` and `y` are objects of `InputIteratorT`'s `value_type`, then - * `x - y` is defined, and `InputIteratorT`'s `value_type` is convertible to - * a type in `OutputIteratorT`'s set of `value_types`, and the return type - * of `x - y` is convertible to a type in `OutputIteratorT`'s set of - * `value_types`. - * - * @tparam OutputIteratorT - * is a model of Output Iterator. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `OutputIteratorT`'s set of - * `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_input - * Pointer to the input sequence - * - * @param[out] d_output - * Pointer to the output sequence - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0 - */ + //! @rst + //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory + //! + //! Overview + //! ++++++++++++++++++++++++++ + //! + //! - Calculates the differences of adjacent elements in ``d_input``. + //! That is, ``*d_input`` is assigned to ``*d_output``, and, for each iterator ``i`` in the + //! range ``[d_input + 1, d_input + num_items)``, the result of + //! ``difference_op(*i, *(i - 1))`` is assigned to ``*(d_output + (i - d_input))``. + //! - Note that the behavior is undefined if the input and output ranges + //! overlap in any way. + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! struct CustomDifference + //! { + //! template + //! __device__ DataType operator()(DataType &lhs, DataType &rhs) + //! { + //! return lhs - rhs; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! int *d_output; + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! + //! cub::DeviceAdjacentDifference::SubtractLeftCopy( + //! d_temp_storage, temp_storage_bytes, + //! 
d_input, d_output, + //! num_items, CustomDifference()); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! cub::DeviceAdjacentDifference::SubtractLeftCopy( + //! d_temp_storage, temp_storage_bytes, + //! d_input, d_output, + //! num_items, CustomDifference()); + //! + //! // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] + //! // d_output <-- [1, 1, -1, 1, -1, 1, -1, 1] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! @rst + //! is a model of `Input Iterator `_, + //! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then + //! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to + //! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type + //! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of + //! ``value_types``. + //! @endrst + //! + //! @tparam OutputIteratorT + //! @rst + //! is a model of `Output Iterator `_. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `OutputIteratorT`'s set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_input + //! Pointer to the input sequence + //! + //! @param[out] d_output + //! Pointer to the output sequence + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0` + //! 
@endrst template - * // or equivalently - * - * struct CustomDifference - * { - * template - * __device__ DataType operator()(DataType &lhs, DataType &rhs) - * { - * return lhs - rhs; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items, CustomDifference()); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items, CustomDifference()); - * - * // d_data <-- [1, 1, -1, 1, -1, 1, -1, 1] - * @endcode - * - * @tparam RandomAccessIteratorT - * is a model of Random Access Iterator, - * `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of - * `RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the - * return type of `x - y` should be convertible to a type in - * `RandomAccessIteratorT`'s set of `value_types`. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s - * set of `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_input - * Pointer to the input sequence and the result - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. + //! + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Calculates the differences of adjacent elements in ``d_input``. That is, for + //! each iterator ``i`` in the range ``[d_input + 1, d_input + num_items)``, the + //! result of ``difference_op(*i, *(i - 1))`` is assigned to + //! ``*(d_input + (i - d_input))``. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! struct CustomDifference + //! { + //! template + //! __device__ DataType operator()(DataType &lhs, DataType &rhs) + //! { + //! return lhs - rhs; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceAdjacentDifference::SubtractLeft( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items, CustomDifference()); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! 
cub::DeviceAdjacentDifference::SubtractLeft( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items, CustomDifference()); + //! + //! // d_data <-- [1, 1, -1, 1, -1, 1, -1, 1] + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! @rst + //! is a model of `Random Access Iterator `_, + //! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of + //! ``RandomAccessIteratorT``'s ``value_type``, and ``x - y`` is defined, then the + //! return type of ``x - y`` should be convertible to a type in + //! ``RandomAccessIteratorT``'s set of ``value_types``. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s + //! set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of `num_items` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_input + //! Pointer to the input sequence and the result + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -429,106 +445,114 @@ public: stream); } - /** - * @brief Subtracts the right element of each adjacent pair of elements - * residing within device-accessible memory. - * - * @ingroup SingleModule - * - * @par Overview - * - Calculates the right differences of adjacent elements in `d_input`. 
That - * is, `*(d_input + num_items - 1)` is assigned to - * `*(d_output + num_items - 1)`, and, for each iterator `i` in the range - * `[d_input, d_input + num_items - 1)`, the result of - * `difference_op(*i, *(i + 1))` is assigned to - * `*(d_output + (i - d_input))`. - * - Note that the behavior is undefined if the input and output ranges - * overlap in any way. - * - * @par Snippet - * The code snippet below illustrates how to use @p DeviceAdjacentDifference - * to compute the difference between adjacent elements. - * - * @par - * @code - * #include - * // or equivalently - * - * struct CustomDifference - * { - * template - * __device__ DataType operator()(DataType &lhs, DataType &rhs) - * { - * return lhs - rhs; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * int *d_output; - * .. - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceAdjacentDifference::SubtractRightCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, num_items, CustomDifference()); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractRightCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, num_items, CustomDifference()); - * - * // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] - * // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] - * @endcode - * - * @tparam InputIteratorT - * is a model of Input Iterator, - * and `x` and `y` are objects of `InputIteratorT`'s `value_type`, then - * `x - y` is defined, and `InputIteratorT`'s `value_type` is convertible to - * a type in `OutputIteratorT`'s set of `value_types`, and the return type - * of `x - y` is convertible to a type in `OutputIteratorT`'s set of - * `value_types`. 
- * - * @tparam OutputIteratorT - * is a model of Output Iterator. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s - * set of `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_input - * Pointer to the input sequence - * - * @param[out] d_output - * Pointer to the output sequence - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. + //! + //! Overview + //! ++++++++++++++++++++++++++ + //! + //! - Calculates the right differences of adjacent elements in ``d_input``. + //! That is, ``*(d_input + num_items - 1)`` is assigned to + //! ``*(d_output + num_items - 1)``, and, for each iterator ``i`` in the range + //! ``[d_input, d_input + num_items - 1)``, the result of + //! ``difference_op(*i, *(i + 1))`` is assigned to + //! ``*(d_output + (i - d_input))``. + //! - Note that the behavior is undefined if the input and output ranges + //! overlap in any way. + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! struct CustomDifference + //! { + //! template + //! 
__device__ DataType operator()(DataType &lhs, DataType &rhs) + //! { + //! return lhs - rhs; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! int *d_output; + //! .. + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceAdjacentDifference::SubtractRightCopy( + //! d_temp_storage, temp_storage_bytes, + //! d_input, d_output, num_items, CustomDifference()); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! cub::DeviceAdjacentDifference::SubtractRightCopy( + //! d_temp_storage, temp_storage_bytes, + //! d_input, d_output, num_items, CustomDifference()); + //! + //! // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] + //! // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! @rst + //! is a model of `Input Iterator `_, + //! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then + //! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to + //! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type + //! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of + //! ``value_types``. + //! @endrst + //! + //! @tparam OutputIteratorT + //! @rst + //! is a model of `Output Iterator `_. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s + //! set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! 
@param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_input + //! Pointer to the input sequence + //! + //! @param[out] d_output + //! Pointer to the output sequence + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences. + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceAdjacentDifference::SubtractRight( - * d_temp_storage, temp_storage_bytes, d_data, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractRight( - * d_temp_storage, temp_storage_bytes, d_data, num_items); - * - * // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] - * @endcode - * - * @tparam RandomAccessIteratorT - * is a model of Random Access Iterator, - * `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of - * `RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the - * return type of `x - y` should be convertible to a type in - * `RandomAccessIteratorT`'s set of `value_types`. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s - * set of `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_input - * Pointer to the input sequence - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. + //! + //! Overview + //! ++++++++++++++++++++++++++ + //! + //! Calculates the right differences of adjacent elements in ``d_input``. + //! That is, for each iterator ``i`` in the range + //! ``[d_input, d_input + num_items - 1)``, the result of + //! ``difference_op(*i, *(i + 1))`` is assigned to ``*(d_input + (i - d_input))``. + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceAdjacentDifference::SubtractRight( + //! d_temp_storage, temp_storage_bytes, d_data, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! cub::DeviceAdjacentDifference::SubtractRight( + //! d_temp_storage, temp_storage_bytes, d_data, num_items); + //! + //! // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! @rst + //! 
is a model of `Random Access Iterator `_, + //! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of + //! ``RandomAccessIteratorT``'s `value_type`, and ``x - y`` is defined, then the + //! return type of ``x - y`` should be convertible to a type in + //! ``RandomAccessIteratorT``'s set of ``value_types``. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s + //! set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_input + //! Pointer to the input sequence + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh index f6c9151f4a..946f03f57f 100644 --- a/cub/cub/device/device_copy.cuh +++ b/cub/cub/device/device_copy.cuh @@ -25,10 +25,7 @@ * ******************************************************************************/ -/** - * \file - * cub::DeviceCopy provides device-wide, parallel operations for copying data. - */ +//! @file cub::DeviceCopy provides device-wide, parallel operations for copying data. #pragma once @@ -50,101 +47,119 @@ CUB_NAMESPACE_BEGIN -/** - * @brief cub::DeviceCopy provides device-wide, parallel operations for copying data. - * \ingroup SingleModule - */ +//! @brief cub::DeviceCopy provides device-wide, parallel operations for copying data. 
struct DeviceCopy { - /** - * @brief Copies data from a batch of given source ranges to their corresponding destination - * ranges. - * @note If any input range aliases any output range the behavior is undefined. If - * any output range aliases another output range the behavior is undefined. Input - * ranges can alias one another. - * - * @par Snippet - * The code snippet below illustrates usage of DeviceCopy::Batched to perform a DeviceRunLength - * Decode operation. - * @par - * @code - * struct GetIteratorToRange - * { - * __host__ __device__ __forceinline__ auto operator()(uint32_t index) - * { - * return thrust::make_constant_iterator(d_data_in[index]); - * } - * int32_t *d_data_in; - * }; - * - * struct GetPtrToRange - * { - * __host__ __device__ __forceinline__ auto operator()(uint32_t index) - * { - * return d_data_out + d_offsets[index]; - * } - * int32_t *d_data_out; - * uint32_t *d_offsets; - * }; - * - * struct GetRunLength - * { - * __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index) - * { - * return d_offsets[index + 1] - d_offsets[index]; - * } - * uint32_t *d_offsets; - * }; - * - * uint32_t num_ranges = 5; - * int32_t *d_data_in; // e.g., [4, 2, 7, 3, 1] - * int32_t *d_data_out; // e.g., [0, ... 
] - * uint32_t *d_offsets; // e.g., [0, 2, 5, 6, 9, 14] - * - * // Returns a constant iterator to the element of the i-th run - * thrust::counting_iterator iota(0); - * auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in}); - * - * // Returns the run length of the i-th run - * auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets}); - * - * // Returns pointers to the output range for each run - * auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets}); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, - * num_ranges); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run batched copy algorithm (used to perform runlength decoding) - * cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, - * num_ranges); - * - * // d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1] - * @endcode - * @tparam InputIt [inferred] Device-accessible random-access input iterator type - * providing the iterators to the source ranges - * @tparam OutputIt [inferred] Device-accessible random-access input iterator type - * providing the iterators to the destination ranges - * @tparam SizeIteratorT [inferred] Device-accessible random-access input iterator - * type providing the number of items to be copied for each pair of ranges - * @param d_temp_storage [in] Device-accessible allocation of temporary storage. When NULL, the - * required allocation size is written to \p temp_storage_bytes and no work is done. 
- * @param temp_storage_bytes [in,out] Reference to size in bytes of \p d_temp_storage allocation - * @param input_it [in] Device-accessible iterator providing the iterators to the source - * ranges - * @param output_it [in] Device-accessible iterator providing the iterators to the - * destination ranges - * @param sizes [in] Device-accessible iterator providing the number of elements to be copied - * for each pair of ranges - * @param num_ranges [in] The total number of range pairs - * @param stream [in] [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Copies data from a batch of given source ranges to their corresponding destination ranges. + //! + //! .. note:: + //! + //! If any input range aliases any output range the behavior is undefined. + //! If any output range aliases another output range the behavior is undefined. + //! Input ranges can alias one another. + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates usage of DeviceCopy::Batched to perform a DeviceRunLength Decode operation. + //! + //! .. code-block:: c++ + //! + //! struct GetIteratorToRange + //! { + //! __host__ __device__ __forceinline__ auto operator()(uint32_t index) + //! { + //! return thrust::make_constant_iterator(d_data_in[index]); + //! } + //! int32_t *d_data_in; + //! }; + //! + //! struct GetPtrToRange + //! { + //! __host__ __device__ __forceinline__ auto operator()(uint32_t index) + //! { + //! return d_data_out + d_offsets[index]; + //! } + //! int32_t *d_data_out; + //! uint32_t *d_offsets; + //! }; + //! + //! struct GetRunLength + //! { + //! __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index) + //! { + //! return d_offsets[index + 1] - d_offsets[index]; + //! } + //! uint32_t *d_offsets; + //! }; + //! + //! uint32_t num_ranges = 5; + //! int32_t *d_data_in; // e.g., [4, 2, 7, 3, 1] + //! int32_t *d_data_out; // e.g., [0, ... ] + //! 
uint32_t *d_offsets; // e.g., [0, 2, 5, 6, 9, 14] + //! + //! // Returns a constant iterator to the element of the i-th run + //! thrust::counting_iterator iota(0); + //! auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in}); + //! + //! // Returns the run length of the i-th run + //! auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets}); + //! + //! // Returns pointers to the output range for each run + //! auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets}); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, + //! num_ranges); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run batched copy algorithm (used to perform runlength decoding) + //! cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, + //! num_ranges); + //! + //! // d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1] + //! + //! @endrst + //! + //! @tparam InputIt + //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges + //! + //! @tparam OutputIt + //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to + //! the destination ranges + //! + //! @tparam SizeIteratorT + //! **[inferred]** Device-accessible random-access input iterator type providing the number of items to be + //! copied for each pair of ranges + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! 
Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] input_it + //! Device-accessible iterator providing the iterators to the source ranges + //! + //! @param[in] output_it + //! Device-accessible iterator providing the iterators to the destination ranges + //! + //! @param[in] sizes + //! Device-accessible iterator providing the number of elements to be copied for each pair of ranges + //! + //! @param[in] num_ranges + //! The total number of range pairs + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Batched(void *d_temp_storage, size_t &temp_storage_bytes, diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 8047a1a463..a6a3e0edd2 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -26,11 +26,9 @@ * ******************************************************************************/ -/** - * @file cub::DeviceHistogram provides device-wide parallel operations for - * constructing histogram(s) from a sequence of samples data residing - * within device-accessible memory. - */ +//! @file cub::DeviceHistogram provides device-wide parallel operations for +//! constructing histogram(s) from a sequence of samples data residing +//! within device-accessible memory. #pragma once @@ -56,134 +54,131 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DeviceHistogram provides device-wide parallel operations for - * constructing histogram(s) from a sequence of samples data residing - * within device-accessible memory. ![](histogram_logo.png) - * @ingroup SingleModule - * - * @par Overview - * A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). - * - * @par Usage Considerations - * @cdp_class{DeviceHistogram} - * - */ +//! @rst +//! 
DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of +//! samples data residing within device-accessible memory. +//! +//! Overview +//! ++++++++++++++++++++++++++ +//! +//! A `histogram `_ counts the number of observations that fall into each +//! of the disjoint categories (known as *bins*). +//! +//! Usage Considerations +//! ++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceHistogram} +//! +//! @endrst struct DeviceHistogram { - /******************************************************************//** - * @name Evenly-segmented bin ranges - *********************************************************************/ - //@{ + //! @name Evenly-segmented bin ranges + //! @{ - /** - * @brief Computes an intensity histogram from a sequence of data samples - * using equal-width bins. - * - * @par - * - The number of histogram bins is (`num_levels - 1`) - * - All bins comprise the same width of sample values: - * `(upper_level - lower_level) / (num_levels - 1)`. - * - If the common type of `SampleT` and `LevelT` is of integral type, the bin for a sample is - * computed as `(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)`, round - * down to the nearest whole number. To protect against potential overflows, if the product - * `(upper_level - lower_level) * (num_levels - 1)` exceeds the number representable by an - * `uint64_t`, the cuda error `cudaErrorInvalidValue` is returned. If the common type is 128 - * bits wide, bin computation will use 128-bit arithmetic and `cudaErrorInvalidValue` will only - * be returned if bin computation would overflow for 128-bit arithmetic. - * - The ranges `[d_samples, d_samples + num_samples)` and - * `[d_histogram, d_histogram + num_levels - 1)` shall not overlap - * in any way. - * - `cuda::std::common_type` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. The common type must be - * convertible to `int` and trivially copyable. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a sequence of float samples - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histogram - * int num_samples; // e.g., 10 - * float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_samples); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_samples); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading input - * samples \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length - * `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin. - * - * @param[in] num_samples - * The number of input samples (i.e., the length of `d_samples`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using equal-width bins. + //! + //! - The number of histogram bins is (``num_levels - 1``) + //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)``. + //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round + //! down to the nearest whole number. To protect against potential overflows, if the product + //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an + //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 + //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only + //! be returned if bin computation would overflow for 128-bit arithmetic. + //! 
- The ranges ``[d_samples, d_samples + num_samples)`` and + //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT and SampleT must be valid + //! arithmetic types. The common type must be convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a sequence of float samples + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histogram + //! int num_samples; // e.g., 10 + //! float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5] + //! int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels; // e.g., 7 (seven level boundaries for six bins) + //! float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + //! float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_samples); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_samples); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! 
**[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length + //! `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin. + //! + //! @param[in] num_samples + //! The number of input samples (i.e., the length of `d_samples`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template ` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. The common type must be - * convertible to `int` and trivially copyable. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. 
- * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * size_t row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.1, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, d_samples, d_histogram, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of - * length `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin. - * - * @param[in] num_row_samples - * The number of data samples per row in the region of interest - * - * @param[in] num_rows - * The number of rows in the region of interest - * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in - * the region of interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using equal-width bins. + //! + //! - A two-dimensional *region of interest* within ``d_samples`` can be specified using + //! the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins is (``num_levels - 1``) + //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)`` + //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is + //! 
computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round + //! down to the nearest whole number. To protect against potential overflows, if the product + //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an + //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 + //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only + //! be returned if bin computation would overflow for 128-bit arithmetic. + //! - For a given row ``r`` in ``[0, num_rows)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and + //! ``row_end = row_begin + num_row_samples``. The ranges + //! ``[row_begin, row_end)`` and ``[d_histogram, d_histogram + num_levels - 1)`` + //! shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. The common type must be + //! convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a 2x5 region of interest within a flattened 2x7 array of float samples. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histogram + //! int num_row_samples; // e.g., 5 + //! int num_rows; // e.g., 2; + //! size_t row_stride_bytes; // e.g., 7 * sizeof(float) + //! float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, -, -, + //! // 0.3, 2.9, 2.1, 6.1, 999.5, -, -] + //! int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels; // e.g., 7 (seven level boundaries for six bins) + //! float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + //! 
float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of + //! length `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! 
Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin. + //! + //! @param[in] num_row_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in + //! the region of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram bins - * have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - If the common type of sample and level is of integral type, the bin for a sample is - * computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - - * lower_level[i])`, round down to the nearest whole number. To protect against potential - * overflows, if, for any channel `i`, the product `(upper_level[i] - lower_level[i]) * - * (num_levels[i] - 1)` exceeds the number representable by an `uint64_t`, the cuda error - * `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation - * will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin - * computation would overflow for 128-bit arithmetic. - * - For a given channel `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges - * `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and - * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap - * in any way. - * - `cuda::std::common_type` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. 
The common type must be - * convertible to `int` and trivially copyable. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 256-bin RGB histograms - * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histograms - * int num_pixels; // e.g., 5 - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), - * // (0, 6, 7, 5), (3, 0, 2, 6)] - * int* d_histogram[3]; // e.g., three device pointers to three device buffers, - * // each allocated with 256 integer counters - * int num_levels[3]; // e.g., {257, 257, 257}; - * unsigned int lower_level[3]; // e.g., {0, 0, 0}; - * unsigned int upper_level[3]; // e.g., {256, 256, 256}; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_pixels); - * - * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** 
Random-access input iterator type for reading - * input samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. - * The samples from different channels are assumed to be interleaved - * (e.g., an array of 32-bit pixels where each pixel consists of four - * *RGBA* 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of - * `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin in - * each active channel. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin - * in each active channel. - * - * @param[in] num_pixels - * The number of multi-channel pixels - * (i.e., the length of `d_samples / NUM_CHANNELS`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using + //! 
equal-width bins. + //! + //! - The input is a sequence of *pixel* structures, where each pixel comprises + //! a record of ``NUM_CHANNELS`` consecutive data samples + //! (e.g., an *RGBA* pixel). + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` + //! (e.g., only *RGB* histograms from *RGBA* pixel samples). + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - If the common type of sample and level is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, round down + //! to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, the product + //! ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by an ``uint64_t``, + //! the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 bits wide, bin computation + //! will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only be returned if bin + //! computation would overflow for 128-bit arithmetic. + //! - For a given channel ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges + //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` and + //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. + //! The common type must be convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 256-bin *RGB* histograms + //! 
from a quad-channel sequence of *RGBA* pixels (8 bits per channel per pixel) + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histograms + //! int num_pixels; // e.g., 5 + //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + //! // (0, 6, 7, 5), (3, 0, 2, 6)] + //! int* d_histogram[3]; // e.g., three device pointers to three device buffers, + //! // each allocated with 256 integer counters + //! int num_levels[3]; // e.g., {257, 257, 257}; + //! unsigned int lower_level[3]; // e.g., {0, 0, 0}; + //! unsigned int upper_level[3]; // e.g., {256, 256, 256}; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_pixels); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_pixels); + //! + //! // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + //! // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + //! // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! 
@tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! The samples from different channels are assumed to be interleaved + //! (e.g., an array of 32-bit pixels where each pixel consists of four + //! *RGBA* 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! The pointers to the histogram counter output arrays, one for each active + //! channel. For channel\ :sub:`i`, the allocation length of + //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in each active channel. + //! Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + //! + //! @param[in] num_pixels + //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. 
- * - For channeli, the range of values for all histogram - * bins have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - If the common type of sample and level is of integral type, the bin for a sample is - * computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - - * lower_level[i])`, round down to the nearest whole number. To protect against potential - * overflows, if, for any channel `i`, the product `(upper_level[i] - lower_level[i]) * - * (num_levels[i] - 1)` exceeds the number representable by an `uint64_t`, the cuda error - * `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation - * will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin - * computation would overflow for 128-bit arithmetic. - * - For a given row `r` in `[0, num_rows)`, and sample `s` in - * `[0, num_row_pixels)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, - * `sample_begin = row_begin + s * NUM_CHANNELS`, and - * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For a given channel - * `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges - * `[sample_begin, sample_end)` and - * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap - * in any way. - * - `cuda::std::common_type` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. The common type must be - * convertible to `int` and trivially copyable. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 256-bin - * *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 - * array of quad-channel *RGBA* pixels (8 bits per channel per pixel). 
- * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input - * // samples and output histograms - * int num_row_pixels; // e.g., 3 - * int num_rows; // e.g., 2 - * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), - * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] - * int* d_histogram[3]; // e.g., three device pointers to three device buffers, - * // each allocated with 256 integer counters - * int num_levels[3]; // e.g., {257, 257, 257}; - * unsigned int lower_level[3]; // e.g., {0, 0, 0}; - * unsigned int upper_level[3]; // e.g., {256, 256, 256}; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading input - * samples. 
\iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. The - * samples from different channels are assumed to be interleaved (e.g., - * an array of 32-bit pixels where each pixel consists of four - * *RGBA* 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each - * active channel. For channeli, the allocation length - * of `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin in - * each active channel. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin - * in each active channel. - * - * @param[in] num_row_pixels - * The number of multi-channel pixels per row in the region of interest - * - * @param[in] num_rows - * The number of rows in the region of interest - * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in the region of - * interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! 
@rst + //! Computes per-channel intensity histograms from a sequence of + //! multi-channel "pixel" data samples using equal-width bins. + //! + //! - The input is a sequence of *pixel* structures, where each pixel + //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., only *RGB* + //! histograms from *RGBA* pixel samples). + //! - A two-dimensional *region of interest* within ``d_samples`` can be + //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - If the common type of sample and level is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, + //! round down to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, + //! the product ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by + //! an ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. + //! If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` + //! will only be returned if bin computation would overflow for 128-bit arithmetic. + //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in + //! ``[0, num_row_pixels)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``, + //! 
``sample_begin = row_begin + s * NUM_CHANNELS``, and + //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For a given channel + //! ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges + //! ``[sample_begin, sample_end)`` and + //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. The common type must be + //! convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 256-bin + //! *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 + //! array of quad-channel *RGBA* pixels (8 bits per channel per pixel). + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input + //! // samples and output histograms + //! int num_row_pixels; // e.g., 3 + //! int num_rows; // e.g., 2 + //! size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + //! // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + //! int* d_histogram[3]; // e.g., three device pointers to three device buffers, + //! // each allocated with 256 integer counters + //! int num_levels[3]; // e.g., {257, 257, 257}; + //! unsigned int lower_level[3]; // e.g., {0, 0, 0}; + //! unsigned int upper_level[3]; // e.g., {256, 256, 256}; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_pixels, num_rows, row_stride_bytes); + //! + //! 
// Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_pixels, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + //! // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + //! // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input + //! samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. The + //! samples from different channels are assumed to be interleaved (e.g., + //! an array of 32-bit pixels where each pixel consists of four + //! *RGBA* 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! The pointers to the histogram counter output arrays, one for each + //! 
active channel. For channel\ :sub:`i`, the allocation length + //! of ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in each active channel. + //! Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + //! + //! @param[in] num_row_pixels + //! The number of multi-channel pixels per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region of + //! interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is `[level[i], level[i+1])` - * - The range `[d_histogram, d_histogram + num_levels - 1)` shall not - * overlap `[d_samples, d_samples + num_samples)` nor - * `[d_levels, d_levels + num_levels)` in any way. The ranges - * `[d_levels, d_levels + num_levels)` and - * `[d_samples, d_samples + num_samples)` may overlap. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of an six-bin histogram - * from a sequence of float samples - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input - * // samples and output histogram - * int num_samples; // e.g., 10 - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels // e.g., 7 (seven level boundaries for six bins) - * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_samples); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_samples); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples.\iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length - * `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] d_levels - * The pointer to the array of boundaries (levels). Bin ranges are defined - * by consecutive boundary pairings: lower sample value boundaries are - * inclusive and upper sample value boundaries are exclusive. - * - * @param[in] num_samples - * The number of data samples per row in the region of interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + //! + //! - The number of histogram bins is (``num_levels - 1``) + //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` + //! - The range ``[d_histogram, d_histogram + num_levels - 1)`` shall not + //! overlap ``[d_samples, d_samples + num_samples)`` nor + //! ``[d_levels, d_levels + num_levels)`` in any way. The ranges + //! ``[d_levels, d_levels + num_levels)`` and + //! ``[d_samples, d_samples + num_samples)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a sequence of float samples + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input + //! // samples and output histogram + //! int num_samples; // e.g., 10 + //! float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + //! 
int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels // e.g., 7 (seven level boundaries for six bins) + //! float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_samples); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_samples); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length + //! `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! 
Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] d_levels + //! The pointer to the array of boundaries (levels). Bin ranges are defined + //! by consecutive boundary pairings: lower sample value boundaries are + //! inclusive and upper sample value boundaries are exclusive. + //! + //! @param[in] num_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is `[level[i], level[i+1])` - * - For a given row `r` in `[0, num_rows)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and - * `row_end = row_begin + num_row_samples`. The range - * `[d_histogram, d_histogram + num_levels - 1)` shall not overlap - * `[row_begin, row_end)` nor `[d_levels, d_levels + num_levels)`. - * The ranges `[d_levels, d_levels + num_levels)` and `[row_begin, row_end)` - * may overlap. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * int row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels // e.g., 7 (seven level boundaries for six bins) - * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length - * `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] d_levels - * The pointer to the array of boundaries (levels). 
Bin ranges are defined - * by consecutive boundary pairings: lower sample value boundaries are - * inclusive and upper sample value boundaries are exclusive. - * - * @param[in] num_row_samples - * The number of data samples per row in the region of interest - * - * @param[in] num_rows - * The number of rows in the region of interest - * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in the region - * of interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + //! + //! - A two-dimensional *region of interest* within ``d_samples`` can be + //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins is (``num_levels - 1``) + //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` + //! - For a given row ``r`` in ``[0, num_rows)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and + //! ``row_end = row_begin + num_row_samples``. The range + //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap + //! ``[row_begin, row_end)`` nor ``[d_levels, d_levels + num_levels)``. + //! The ranges ``[d_levels, d_levels + num_levels)`` and ``[row_begin, row_end)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a 2x5 region of interest within a flattened 2x7 array of float samples. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input samples and + //! 
// output histogram + //! int num_row_samples; // e.g., 5 + //! int num_rows; // e.g., 2; + //! int row_stride_bytes; // e.g., 7 * sizeof(float) + //! float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + //! // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + //! int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels // e.g., 7 (seven level boundaries for six bins) + //! float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! 
The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length + //! `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] d_levels + //! The pointer to the array of boundaries (levels). Bin ranges are defined + //! by consecutive boundary pairings: lower sample value boundaries are + //! inclusive and upper sample value boundaries are exclusive. + //! + //! @param[in] num_row_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region + //! of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram - * bins have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the - * range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall - * not overlap `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` nor - * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. - * The ranges `[d_levels[c2], d_levels[c2] + num_levels[c2])` and - * `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` may overlap. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 4-bin *RGB* - * histograms from a quad-channel sequence of *RGBA* pixels - * (8 bits per channel per pixel) - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histograms - * int num_pixels; // e.g., 5 - * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), - * // (0, 6, 7, 5),(3, 0, 2, 6)] - * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // d_histogram <-- [ [1, 3, 0, 1], - * // [3, 0, 0, 2], - * // [0, 2, 0, 3] ] - * - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples. 
\iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. - * The samples from different channels are assumed to be interleaved (e.g., - * an array of 32-bit pixels where each pixel consists of four *RGBA* - * 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of - * `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] d_levels - * The pointers to the arrays of boundaries (levels), one for each active - * channel. Bin ranges are defined by consecutive boundary pairings: lower - * sample value boundaries are inclusive and upper sample value boundaries - * are exclusive. - * - * @param[in] num_pixels - * The number of multi-channel pixels - * (i.e., the length of `d_samples / NUM_CHANNELS`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples + //! using the specified bin boundary levels. + //! + //! 
- The input is a sequence of *pixel* structures, where each pixel + //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples). + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - For given channels ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the + //! range ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall + //! not overlap ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` nor + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. + //! The ranges ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and + //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 4-bin *RGB* + //! histograms from a quad-channel sequence of *RGBA* pixels + //! (8 bits per channel per pixel) + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histograms + //! int num_pixels; // e.g., 5 + //! unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + //! // (0, 6, 7, 5),(3, 0, 2, 6)] + //! unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + //! int num_levels[3]; // e.g., {5, 5, 5}; + //! unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8] ]; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! 
size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_pixels); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_pixels); + //! + //! // d_histogram <-- [ [1, 3, 0, 1], + //! // [3, 0, 0, 2], + //! // [0, 2, 0, 3] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! The samples from different channels are assumed to be interleaved (e.g., + //! an array of 32-bit pixels where each pixel consists of four *RGBA* + //! 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! 
The pointers to the histogram counter output arrays, one for each active + //! channel. For channel\ :sub:`i`, the allocation length of + //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in + //! each active channel. Implies that the number of bins for + //! channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] d_levels + //! The pointers to the arrays of boundaries (levels), one for each active + //! channel. Bin ranges are defined by consecutive boundary pairings: lower + //! sample value boundaries are inclusive and upper sample value boundaries + //! are exclusive. + //! + //! @param[in] num_pixels + //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram - * bins have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - For a given row `r` in `[0, num_rows)`, and sample `s` in - * `[0, num_row_pixels)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, - * `sample_begin = row_begin + s * NUM_CHANNELS`, and - * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For given channels - * `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the range - * `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not - * overlap `[sample_begin, sample_end)` nor - * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. The ranges - * `[d_levels[c2], d_levels[c2] + num_levels[c2])` and - * `[sample_begin, sample_end)` may overlap. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 4-bin *RGB* - * histograms from a 2x3 region of interest of within a flattened 2x4 array - * of quad-channel *RGBA* pixels (8 bits per channel per pixel). - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input - * // samples and output histograms - * int num_row_pixels; // e.g., 3 - * int num_rows; // e.g., 2 - * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), - * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] - * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * d_levels, num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [2, 3, 0, 1], - * // [3, 0, 0, 2], - * // [1, 2, 0, 3] ] - * - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - 
* **[inferred]** Random-access input iterator type for reading input - * samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to \p temp_storage_bytes and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. The - * samples from different channels are assumed to be interleaved (e.g., an - * array of 32-bit pixels where each pixel consists of four - * *RGBA* 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of - * `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] d_levels - * The pointers to the arrays of boundaries (levels), one for each active - * channel. Bin ranges are defined by consecutive boundary pairings: lower - * sample value boundaries are inclusive and upper sample value boundaries - * are exclusive. 
- *
- * @param[in] num_row_pixels
- * The number of multi-channel pixels per row in the region of interest
- *
- * @param[in] num_rows
- * The number of rows in the region of interest
- *
- * @param[in] row_stride_bytes
- * The number of bytes between starts of consecutive rows in the
- * region of interest
- *
- * @param[in] stream
- * **[optional]** CUDA stream to launch kernels within.
- * Default is stream0.
- */
+ //! @rst
+ //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using
+ //! the specified bin boundary levels.
+ //!
+ //! - The input is a sequence of *pixel* structures, where each pixel comprises
+ //! a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel).
+ //! - Of the ``NUM_CHANNELS`` specified, the function will only compute
+ //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples).
+ //! - A two-dimensional *region of interest* within ``d_samples`` can be
+ //! specified using the ``num_row_pixels``, ``num_rows``, and ``row_stride_bytes`` parameters.
+ //! - The row stride must be a whole multiple of the sample data type
+ //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``.
+ //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
+ //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width:
+ //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)``
+ //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in ``[0, num_row_pixels)``, let
+ //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``,
+ //! ``sample_begin = row_begin + s * NUM_CHANNELS``, and
+ //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For given channels
+ //! ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the range
+ //! ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall not overlap
+ //!
``[sample_begin, sample_end)`` nor + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. The ranges + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and + //! ``[sample_begin, sample_end)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 4-bin *RGB* + //! histograms from a 2x3 region of interest of within a flattened 2x4 array + //! of quad-channel *RGBA* pixels (8 bits per channel per pixel). + //! + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input + //! // samples and output histograms + //! int num_row_pixels; // e.g., 3 + //! int num_rows; // e.g., 2 + //! size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + //! // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + //! int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + //! int num_levels[3]; // e.g., {5, 5, 5}; + //! unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8] ]; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, + //! num_row_pixels, num_rows, row_stride_bytes); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! d_levels, num_row_pixels, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [ [2, 3, 0, 1], + //! 
// [3, 0, 0, 2], + //! // [1, 2, 0, 3] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input + //! samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. The + //! samples from different channels are assumed to be interleaved (e.g., an + //! array of 32-bit pixels where each pixel consists of four + //! *RGBA* 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! The pointers to the histogram counter output arrays, one for each active + //! channel. For channel\ :sub:`i`, the allocation length of + //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in + //! each active channel. Implies that the number of bins for + //! channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] d_levels + //! 
The pointers to the arrays of boundaries (levels), one for each active + //! channel. Bin ranges are defined by consecutive boundary pairings: lower + //! sample value boundaries are inclusive and upper sample value boundaries + //! are exclusive. + //! + //! @param[in] num_row_pixels + //! The number of multi-channel pixels per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the + //! region of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template [inferred] Device-accessible random-access input iterator type - * providing the pointers to the source memory buffers - * @tparam OutputBufferIt [inferred] Device-accessible random-access input iterator type - * providing the pointers to the destination memory buffers - * @tparam BufferSizeIteratorT [inferred] Device-accessible random-access input iterator - * type providing the number of bytes to be copied for each pair of buffers - * @param d_temp_storage [in] Device-accessible allocation of temporary storage. When NULL, the - * required allocation size is written to \p temp_storage_bytes and no work is done. - * @param temp_storage_bytes [in,out] Reference to size in bytes of \p d_temp_storage allocation - * @param input_buffer_it [in] Device-accessible iterator providing the pointers to the source - * memory buffers - * @param output_buffer_it [in] Device-accessible iterator providing the pointers to the - * destination memory buffers - * @param buffer_sizes [in] Device-accessible iterator providing the number of bytes to be copied - * for each pair of buffers - * @param num_buffers [in] The total number of buffer pairs - * @param stream [in] [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! 
Copies data from a batch of given source buffers to their corresponding destination buffer.
+ //!
+ //! .. note::
+ //!
+ //! If any input buffer aliases memory from any output buffer the behavior is undefined.
+ //! If any output buffer aliases memory of another output buffer the behavior is undefined.
+ //! Input buffers can alias one another.
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates usage of DeviceMemcpy::Batched for mutating strings within
+ //! a single string buffer.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! struct GetPtrToStringItem
+ //! {
+ //! __host__ __device__ __forceinline__ void *operator()(uint32_t index)
+ //! {
+ //! return &d_string_data_in[d_string_offsets[index]];
+ //! }
+ //! char *d_string_data_in;
+ //! uint32_t *d_string_offsets;
+ //! };
+ //!
+ //! struct GetStringItemSize
+ //! {
+ //! __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
+ //! {
+ //! return d_string_offsets[index + 1] - d_string_offsets[index];
+ //! }
+ //! uint32_t *d_string_offsets;
+ //! };
+ //!
+ //! uint32_t num_strings = 5;
+ //! char *d_string_data_in; // e.g., "TomatoesBananasApplesOrangesGrapes"
+ //! char *d_string_data_out; // e.g., " ... "
+ //! uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34]
+ //! uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34]
+ //! uint32_t *d_gather_index; // e.g., [2, 1, 4, 3, 0]
+ //!
+ //! // Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced
+ //! auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0),
+ //! d_gather_index);
+ //!
+ //! // Returns pointers to the input buffer for each string
+ //! auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator,
+ //! GetPtrToStringItem{d_string_data_in,
+ //! d_string_offsets_old});
+ //!
+ //! // Returns the string size of the i-th string
+ //!
auto str_sizes = thrust::make_transform_iterator(gather_iterator, + //! GetStringItemSize{d_string_offsets_old}); + //! + //! // Returns pointers to the output buffer for each string + //! auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + //! GetPtrToStringItem{d_string_data_out, + //! d_string_offsets_new}); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out, + //! str_sizes, num_strings); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run batched copy algorithm (used to permute strings) + //! cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out, + //! str_sizes, num_strings); + //! + //! // d_string_data_out <-- "ApplesBananasGrapesOrangesTomatoes" + //! + //! @endrst + //! + //! @tparam InputBufferIt + //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to + //! the source memory buffers + //! + //! @tparam OutputBufferIt + //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to + //! the destination memory buffers + //! + //! @tparam BufferSizeIteratorT + //! **[inferred]** Device-accessible random-access input iterator type providing the number of bytes + //! to be copied for each pair of buffers + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] input_buffer_it + //! Device-accessible iterator providing the pointers to the source memory buffers + //! + //! 
@param[in] output_buffer_it + //! Device-accessible iterator providing the pointers to the destination memory buffers + //! + //! @param[in] buffer_sizes + //! Device-accessible iterator providing the number of bytes to be copied for each pair of buffers + //! + //! @param[in] num_buffers + //! The total number of buffer pairs + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Batched(void *d_temp_storage, size_t &temp_storage_bytes, diff --git a/cub/cub/device/device_merge_sort.cuh b/cub/cub/device/device_merge_sort.cuh index 84ef2b3d2f..e68dcaa46e 100644 --- a/cub/cub/device/device_merge_sort.cuh +++ b/cub/cub/device/device_merge_sort.cuh @@ -49,8 +49,6 @@ CUB_NAMESPACE_BEGIN * computing a merge sort across a sequence of data items residing within * device-accessible memory. * - * @ingroup SingleModule - * * @par Overview * - DeviceMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types (as diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index ee3721f70f..eb3c978a2e 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -26,11 +26,8 @@ * ******************************************************************************/ -/** - * @file - * cub::DevicePartition provides device-wide, parallel operations for - * partitioning sequences of data items residing within device-accessible memory. - */ +//! @file cub::DevicePartition provides device-wide, parallel operations for +//! partitioning sequences of data items residing within device-accessible memory. #pragma once @@ -54,132 +51,122 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DevicePartition provides device-wide, parallel operations for - * partitioning sequences of data items residing within device-accessible - * memory. 
![](partition_logo.png) - * @ingroup SingleModule - * - * @par Overview - * These operations apply a selection criterion to construct a partitioned - * output sequence from items selected/unselected from a specified input - * sequence. - * - * @par Usage Considerations - * \cdp_class{DevicePartition} - * - * @par Performance - * \linear_performance{partition} - * - * @par - * The following chart illustrates DevicePartition::If - * performance across different CUDA architectures for @p int32 items, - * where 50% of the items are randomly selected for the first partition. - * \plots_below - * - * @image html partition_if_int32_50_percent.png - * - */ +//! @rst +//! DevicePartition provides device-wide, parallel operations for +//! partitioning sequences of data items residing within device-accessible memory. +//! +//! Overview +//! ++++++++++++++++++++++++++ +//! +//! These operations apply a selection criterion to construct a partitioned +//! output sequence from items selected/unselected from a specified input +//! sequence. +//! +//! Usage Considerations +//! ++++++++++++++++++++++++++ +//! +//! @cdp_class{DevicePartition} +//! +//! Performance +//! ++++++++++++++++++++++++++ +//! +//! @linear_performance{partition} +//! +//! @endrst struct DevicePartition { - /** - * @brief Uses the @p d_flags sequence to split the corresponding items from - * @p d_in into a partitioned sequence @p d_out. The total number of - * items copied into the first partition is written to - * @p d_num_selected_out. ![](partition_flags_logo.png) - * - * @par - * - The value type of @p d_flags must be castable to @p bool (e.g., - * @p bool, @p char, @p int, etc.). - * - Copies of the selected items are compacted into @p d_out and maintain - * their original relative ordering, however copies of the unselected - * items are compacted into the rear of @p d_out in reverse order. 
- * - The range `[d_out, d_out + num_items)` shall not overlap - * `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any - * way. The range `[d_in, d_in + num_items)` may overlap - * `[d_flags, d_flags + num_items)`. - * - \devicestorage - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an @p int device vector. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * std::size_t temp_storage_bytes = 0; - * cub::DevicePartition::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] - * // d_num_selected_out <-- [4] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading - * input items \iterator - * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading - * selection flags \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing - * output items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number - * of items selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[in] d_flags - * Pointer to the input sequence of selection flags - * - * @param[out] d_out - * Pointer to the output sequence of partitioned data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected (i.e., the - * offset of the unselected partition) - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``d_flags`` sequence to split the corresponding items from + //! ``d_in`` into a partitioned sequence ``d_out``. + //! The total number of items copied into the first partition is written to ``d_num_selected_out``. + //! + //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering, however copies of the unselected + //! items are compacted into the rear of ``d_out`` in reverse order. + //! - The range ``[d_out, d_out + num_items)`` shall not overlap + //! ``[d_in, d_in + num_items)`` nor ``[d_flags, d_flags + num_items)`` in any way. + //! The range ``[d_in, d_in + num_items)`` may overlap ``[d_flags, d_flags + num_items)``. + //! - @devicestorage + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input, flags, and output + //! 
int num_items; // e.g., 8 + //! int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! std::size_t temp_storage_bytes = 0; + //! cub::DevicePartition::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DevicePartition::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + //! // d_num_selected_out <-- [4] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam FlagIterator + //! **[inferred]** Random-access input iterator type for reading selection flags @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[in] d_flags + //! Pointer to the input sequence of selection flags + //! + //! @param[out] d_out + //! Pointer to the output sequence of partitioned data items + //! + //! 
@param[out] d_num_selected_out + //! Pointer to the output total number of items selected (i.e., the + //! offset of the unselected partition) + //! + //! @param[in] num_items + //! Total number of items to select from + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * explicit LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const - * { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * std::size_t temp_storage_bytes = 0; - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] - * // d_num_selected_out <-- [5] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectOp - * **[inferred]** Selection functor type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of partitioned data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected (i.e., the - * offset of the unselected partition) - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] select_op - * Unary selection operator - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! 
Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into + //! a partitioned sequence ``d_out``. The total number of items copied into the first partition is written + //! to ``d_num_selected_out``. + //! + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering, however copies of the unselected + //! items are compacted into the rear of ``d_out`` in reverse order. + //! - The range ``[d_out, d_out + num_items)`` shall not overlap + //! ``[d_in, d_in + num_items)`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! explicit LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const + //! { + //! return (a < compare); + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! LessThan select_op(7); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! std::size_t temp_storage_bytes = 0; + //! cub::DevicePartition::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DevicePartition::If( + //! 
d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam SelectOp + //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of ``d_temp_storage`` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of partitioned data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + //! + //! @param[in] num_items + //! Total number of items to select from + //! + //! @param[in] select_op + //! Unary selection operator + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template - * // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * explicit LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const - * { - * return a < compare; - * } - * }; - * - * // Functor type for selecting values greater than some criteria - * struct GreaterThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * explicit GreaterThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const - * { - * return a > compare; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_large_and_unselected_out; // e.g., [ , , , , , , , ] - * int *d_small_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ , ] - * thrust::reverse_iterator unselected_out(d_large_and_unselected_out + num_items); - * LessThan small_items_selector(7); - * GreaterThan large_items_selector(50); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * std::size_t temp_storage_bytes = 0; - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_large_and_medium_out, d_small_out, unselected_out, - * d_num_selected_out, num_items, - * large_items_selector, small_items_selector); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_large_and_medium_out, d_small_out, unselected_out, - * d_num_selected_out, num_items, - * large_items_selector, small_items_selector); - * - * // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ] - * // d_small_out <-- [ 0, 2, 3, 5, 2, , , ] - * // d_num_selected_out <-- [ 1, 5 ] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading - * input items \iterator - * - * @tparam FirstOutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * items selected by first operator \iterator - * - * @tparam SecondOutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * items selected by second operator \iterator - * - * @tparam UnselectedOutputIteratorT - * **[inferred]** Random-access output iterator type for writing - * unselected items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectFirstPartOp - * **[inferred]** Selection functor type having member - * `bool operator()(const T &a)` - * - * @tparam SelectSecondPartOp - * **[inferred]** Selection functor type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and - * no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_first_part_out - * Pointer to the output sequence of data items selected by - * @p select_first_part_op - * - * @param[out] d_second_part_out - * Pointer to the output sequence of data items selected by - * @p select_second_part_op - * - * @param[out] d_unselected_out - * Pointer to the output sequence of unselected data items - * - * @param[out] d_num_selected_out - * Pointer to the output array with two elements, where total number of - * items selected by @p select_first_part_op is stored as - * `d_num_selected_out[0]` and total number of items selected by - * @p select_second_part_op is stored as `d_num_selected_out[1]`, - * respectively - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] select_first_part_op - * Unary selection operator to select @p d_first_part_out - * - * @param[in] select_second_part_op - * Unary selection operator to select @p d_second_part_out - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses two functors to split the corresponding items from ``d_in`` into three partitioned sequences + //! ``d_first_part_out``, ``d_second_part_out``, and ``d_unselected_out``. + //! The total number of items copied into the first partition is written + //! to ``d_num_selected_out[0]``, while the total number of items copied into the second partition is written + //! to ``d_num_selected_out[1]``. + //! + //! - Copies of the items selected by ``select_first_part_op`` are compacted + //! into ``d_first_part_out`` and maintain their original relative ordering. + //! 
- Copies of the items selected by ``select_second_part_op`` are compacted + //! into ``d_second_part_out`` and maintain their original relative ordering. + //! - Copies of the unselected items are compacted into the ``d_unselected_out`` in reverse order. + //! - The ranges ``[d_out, d_out + num_items)``, + //! ``[d_first_part_out, d_first_part_out + d_num_selected_out[0])``, + //! ``[d_second_part_out, d_second_part_out + d_num_selected_out[1])``, + //! ``[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])``, + //! shall not overlap in any way. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how this algorithm can partition an + //! input vector into small, medium, and large items so that the relative + //! order of items remain deterministic. + //! + //! Let's consider any value that doesn't exceed six a small one. On the + //! other hand, any value that exceeds 50 will be considered a large one. + //! Since the value used to define a small part doesn't match one that + //! defines the large part, the intermediate segment is implied. + //! + //! These definitions partition a value space into three categories. We want + //! to preserve the order of items in which they appear in the input vector. + //! Since the algorithm provides stable partitioning, this is possible. + //! + //! Since the number of items in each category is unknown beforehand, we need + //! three output arrays of num_items elements each. To reduce the memory + //! requirements, we can combine the output storage for two categories. + //! + //! Since each value falls precisely in one category, it's safe to add + //! "large" values into the head of the shared output vector and the "middle" + //! values into its tail. To add items into the tail of the output array, we + //! can use ``thrust::reverse_iterator``. + //! + //! .. code-block:: c++ + //! + //! #include + //! 
// or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! explicit LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const + //! { + //! return a < compare; + //! } + //! }; + //! + //! // Functor type for selecting values greater than some criteria + //! struct GreaterThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! explicit GreaterThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const + //! { + //! return a > compare; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_large_and_unselected_out; // e.g., [ , , , , , , , ] + //! int *d_small_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ , ] + //! thrust::reverse_iterator unselected_out(d_large_and_unselected_out + num_items); + //! LessThan small_items_selector(7); + //! GreaterThan large_items_selector(50); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! std::size_t temp_storage_bytes = 0; + //! cub::DevicePartition::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_large_and_medium_out, d_small_out, unselected_out, + //! d_num_selected_out, num_items, + //! large_items_selector, small_items_selector); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DevicePartition::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_large_and_medium_out, d_small_out, unselected_out, + //! d_num_selected_out, num_items, + //! 
large_items_selector, small_items_selector); + //! + //! // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ] + //! // d_small_out <-- [ 0, 2, 3, 5, 2, , , ] + //! // d_num_selected_out <-- [ 1, 5 ] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam FirstOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output + //! items selected by first operator @iterator + //! + //! @tparam SecondOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output + //! items selected by second operator @iterator + //! + //! @tparam UnselectedOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing + //! unselected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items + //! selected @iterator + //! + //! @tparam SelectFirstPartOp + //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` + //! + //! @tparam SelectSecondPartOp + //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_first_part_out + //! Pointer to the output sequence of data items selected by `select_first_part_op` + //! + //! @param[out] d_second_part_out + //! Pointer to the output sequence of data items selected by `select_second_part_op` + //! + //! @param[out] d_unselected_out + //! Pointer to the output sequence of unselected data items + //! + //! 
@param[out] d_num_selected_out + //! Pointer to the output array with two elements, where total number of + //! items selected by `select_first_part_op` is stored as + //! `d_num_selected_out[0]` and total number of items selected by + //! `select_second_part_op` is stored as `d_num_selected_out[1]`, + //! respectively + //! + //! @param[in] num_items + //! Total number of items to select from + //! + //! @param[in] select_first_part_op + //! Unary selection operator to select `d_first_part_out` + //! + //! @param[in] select_second_part_op + //! Unary selection operator to select `d_second_part_out` + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -3402,7 +3401,7 @@ public: //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` - //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` + //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two @@ -3520,7 +3519,7 @@ public: //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` - //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` + //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify //! differentiating key bits. This can reduce overall sorting overhead and @@ -3642,11 +3641,7 @@ public: stream); } - //@} end member group + //! 
@} end member group }; -/** - * @example example_device_radix_sort.cu - */ - CUB_NAMESPACE_END diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index bef701684d..715c9556bc 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -26,11 +26,9 @@ * ******************************************************************************/ -/** - * @file cub::DeviceReduce provides device-wide, parallel operations for - * computing a reduction across a sequence of data items residing within - * device-accessible memory. - */ +//! @file cub::DeviceReduce provides device-wide, parallel operations for +//! computing a reduction across a sequence of data items residing within +//! device-accessible memory. #pragma once @@ -56,8 +54,6 @@ CUB_NAMESPACE_BEGIN -//! @ingroup SingleModule -//! //! @rst //! DeviceReduce provides device-wide, parallel operations for computing //! a reduction across a sequence of data items residing within @@ -68,140 +64,127 @@ CUB_NAMESPACE_BEGIN //! //! Overview //! ==================================== +//! //! A `reduction `_ //! (or *fold*) uses a binary combining operator to compute a single aggregate //! from a sequence of input elements. //! //! Usage Considerations //! ==================================== +//! //! @cdp_class{DeviceReduce} //! //! Performance //! ==================================== -//! @linear_performance{reduction, reduce-by-key, and run-length encode} -//! -//! The following chart illustrates DeviceReduce::Sum -//! performance across different CUDA architectures for \p int32 keys. -//! -//! .. image:: ../img/reduce_int32.png -//! :align: center -//! -//! @par -//! The following chart illustrates DeviceReduce::ReduceByKey (summation) -//! performance across different CUDA architectures for `fp32` values. Segments -//! are identified by `int32` keys, and have lengths uniformly sampled -//! from `[1, 1000]`. //! -//! .. image:: ../img/reduce_by_key_fp32_len_500.png -//! 
:align: center +//! @linear_performance{reduction, reduce-by-key, and run-length encode} //! //! @endrst struct DeviceReduce { - /** - * @brief Computes a device-wide reduction using the specified binary - * `reduction_op` functor and initial value `init`. - * - * @par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates a user-defined min-reduction of a - * device vector of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * __device__ __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * CustomMin min_op; - * int init; // e.g., INT_MAX - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Reduce( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items, min_op, init); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceReduce::Reduce( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items, min_op, init); - * - * // d_out <-- [0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam ReductionOpT - * **[inferred]** Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type - * of `InputIteratorT` - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param d_in[in] - * Pointer to the input sequence of data items - * - * @param d_out[out] - * Pointer to the output aggregate - * - * @param num_items[in] - * Total number of input items (i.e., length of `d_in`) - * - * @param reduction_op[in] - * Binary reduction functor - * - * @param[in] init - * Initial value of the reduction - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``. + //! + //! 
- Does not support binary reduction operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to a another device of a different compute-capability + //! because CUB can employ different tile-sizing for different architectures. + //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates a user-defined min-reduction of a + //! device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! __device__ __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! CustomMin min_op; + //! int init; // e.g., INT_MAX + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items, min_op, init); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run reduction + //! cub::DeviceReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items, min_op, init); + //! + //! // d_out <-- [0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! 
+ //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam T + //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] init + //! Initial value of the reduction + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- *
- * // Determine temporary device storage requirements
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceReduce::Sum(
- * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // Allocate temporary storage
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run sum-reduction
- * cub::DeviceReduce::Sum(
- * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // d_out <-- [38]
- * @endcode
- *
- * @tparam InputIteratorT
- * **[inferred]** Random-access input iterator type for reading input
- * items \iterator
- *
- * @tparam OutputIteratorT
- * **[inferred]** Output iterator type for recording the reduced
- * aggregate \iterator
- *
- * @tparam NumItemsT **[inferred]** Type of num_items
- *
- * @param[in] d_temp_storage
- * Device-accessible allocation of temporary storage. When `nullptr`, the
- * required allocation size is written to `temp_storage_bytes` and no work
- * is done.
- *
- * @param[in,out] temp_storage_bytes
- * Reference to size in bytes of `d_temp_storage` allocation
- *
- * @param[in] d_in
- * Pointer to the input sequence of data items
- *
- * @param[out] d_out
- * Pointer to the output aggregate
- *
- * @param[in] num_items
- * Total number of input items (i.e., length of `d_in`)
- *
- * @param[in] stream
- * **[optional]** CUDA stream to launch kernels within.
- * Default is stream0.
- */
+ //! @rst
+ //! Computes a device-wide sum using the addition (``+``) operator.
+ //!
+ //! - Uses ``0`` as the initial value of the reduction.
+ //! - Does not support ``+`` operators that are non-commutative.
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the sum-reduction of a device vector + //! of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Sum( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sum-reduction + //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // d_out <-- [38] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! 
@rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -398,82 +373,82 @@ struct DeviceReduce stream); } - /** - * @brief Computes a device-wide minimum using the less-than ('<') operator. - * - * @par - * - Uses `std::numeric_limits::max()` as the initial value of the reduction. - * - Does not support `<` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the min-reduction of a device vector of - * `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide minimum using the less-than (``<``) operator. + //! + //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction. + //! - Does not support ``<`` operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to a another device of a different compute-capability + //! 
because CUB can employ different tile-sizing for different architectures. + //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run min-reduction + //! cub::DeviceReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // d_out <-- [0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! 
Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -529,86 +504,84 @@ struct DeviceReduce stream); } - /** - * @brief Finds the first device-wide minimum using the less-than ('<') - * operator, also returning the index of that item. - * - * @par - * - The output value type of `d_out` is cub::KeyValuePair `` - * (assuming the value type of `d_in` is `T`) - * - The minimum is written to `d_out.value` and its offset in the input - * array is written to `d_out.key`. - * - The `{1, std::numeric_limits::max()}` tuple is produced for - * zero-length inputs - * - Does not support `<` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // d_out <-- [{5, 0}] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type `T`) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `cub::KeyValuePair`) \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to \p temp_storage_bytes and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item. + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``. + //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! + //! - Does not support ``<`` operators that are non-commutative. + //! 
- Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the argmin-reduction of a device vector
+ //! of ``int`` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_items; // e.g., 7
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! KeyValuePair *d_out; // e.g., [{-,-}]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run argmin-reduction
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+ //!
+ //! // d_out <-- [{5, 0}]
+ //!
+ //! @endrst
+ //!
+ //! @tparam InputIteratorT
+ //! **[inferred]** Random-access input iterator type for reading input items
+ //! (of some type `T`) @iterator
+ //!
+ //! @tparam OutputIteratorT
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
+ //! (having value type `cub::KeyValuePair`) @iterator
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
+ //! 
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, @@ -680,83 +653,79 @@ struct DeviceReduce stream); } - /** - * @brief Computes a device-wide maximum using the greater-than ('>') operator. - * - * @par - * - Uses `std::numeric_limits::lowest()` as the initial value of the - * reduction. - * - Does not support `>` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the max-reduction of a device vector of - * `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // d_out <-- [9] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide maximum using the greater-than (``>``) operator. + //! + //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - Does not support ``>`` operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to a another device of a different compute-capability + //! 
because CUB can employ different tile-sizing for different architectures. + //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run max-reduction + //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + //! + //! // d_out <-- [9] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! 
Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -813,87 +782,88 @@ struct DeviceReduce stream); } - /** - * @brief Finds the first device-wide maximum using the greater-than ('>') - * operator, also returning the index of that item - * - * @par - * - The output value type of `d_out` is cub::KeyValuePair `` - * (assuming the value type of `d_in` is `T`) - * - The maximum is written to `d_out.value` and its offset in the input - * array is written to `d_out.key`. - * - The `{1, std::numeric_limits::lowest()}` tuple is produced for - * zero-length inputs - * - Does not support `>` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // d_out <-- [{6, 9}] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type \p T) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `cub::KeyValuePair`) \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide maximum using the greater-than (``>``) + //! operator, also returning the index of that item + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The maximum is written to ``d_out.value`` and its offset in the input + //! array is written to ``d_out.key``. + //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! + //! - Does not support ``>`` operators that are non-commutative. + //! 
- Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the argmax-reduction of a device vector
+ //! of `int` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh>
+ //! // or equivalently <cub/device/device_reduce.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_items; // e.g., 7
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run argmax-reduction
+ //! cub::DeviceReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+ //!
+ //! // d_out <-- [{6, 9}]
+ //!
+ //! @endrst
+ //!
+ //! @tparam InputIteratorT
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
+ //!
+ //! @tparam OutputIteratorT
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
+ //!
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, @@ -966,114 +936,111 @@ struct DeviceReduce stream); } - /** - * @brief Fuses transform and reduce operations - * - * @par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates a user-defined min-reduction of a - * device vector of `int` data elements. 
- * @par - * @code - * #include - * // or equivalently - * - * thrust::device_vector in = { 1, 2, 3, 4 }; - * thrust::device_vector out(1); - * - * std::size_t temp_storage_bytes = 0; - * std::uint8_t *d_temp_storage = nullptr; - * - * const int init = 42; - * - * cub::DeviceReduce::TransformReduce( - * d_temp_storage, - * temp_storage_bytes, - * in.begin(), - * out.begin(), - * in.size(), - * cub::Sum{}, - * square_t{}, - * init); - * - * thrust::device_vector temp_storage(temp_storage_bytes); - * d_temp_storage = temp_storage.data().get(); - * - * cub::DeviceReduce::TransformReduce( - * d_temp_storage, - * temp_storage_bytes, - * in.begin(), - * out.begin(), - * in.size(), - * cub::Sum{}, - * square_t{}, - * init); - * - * // out[0] <-- 72 - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam ReductionOpT - * **[inferred]** Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam TransformOpT - * **[inferred]** Unary reduction functor type having member - * `auto operator()(const T &a)` - * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type - * of `InputIteratorT` - * - * @tparam NumItemsT - * **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- *
- * @param[in,out] temp_storage_bytes
- * Reference to size in bytes of `d_temp_storage` allocation
- *
- * @param[in] d_in
- * Pointer to the input sequence of data items
- *
- * @param[out] d_out
- * Pointer to the output aggregate
- *
- * @param[in] num_items
- * Total number of input items (i.e., length of `d_in`)
- *
- * @param[in] reduction_op
- * Binary reduction functor
- *
- * @param[in] transform_op
- * Unary transform functor
- *
- * @param[in] init
- * Initial value of the reduction
- *
- * @param[in] stream
- * **[optional]** CUDA stream to launch kernels within.
- * Default is stream0.
- */
+ //! @rst
+ //! Fuses transform and reduce operations
+ //!
+ //! - Does not support binary reduction operators that are non-commutative.
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates a user-defined min-reduction of a
+ //! device vector of `int` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh>
+ //! // or equivalently <cub/device/device_reduce.cuh>
+ //!
+ //! thrust::device_vector<int> in = { 1, 2, 3, 4 };
+ //! thrust::device_vector<int> out(1);
+ //!
+ //! std::size_t temp_storage_bytes = 0;
+ //! std::uint8_t *d_temp_storage = nullptr;
+ //!
+ //! const int init = 42;
+ //!
+ //! cub::DeviceReduce::TransformReduce(
+ //! d_temp_storage,
+ //! temp_storage_bytes,
+ //! in.begin(),
+ //! out.begin(),
+ //! in.size(),
+ //! cub::Sum{},
+ //! square_t{},
+ //! init);
+ //!
+ //! thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+ //!
d_temp_storage = temp_storage.data().get(); + //! + //! cub::DeviceReduce::TransformReduce( + //! d_temp_storage, + //! temp_storage_bytes, + //! in.begin(), + //! out.begin(), + //! in.size(), + //! cub::Sum{}, + //! square_t{}, + //! init); + //! + //! // out[0] <-- 72 + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam TransformOpT + //! **[inferred]** Unary reduction functor type having member `auto operator()(const T &a)` + //! + //! @tparam T + //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] transform_op + //! Unary transform functor + //! + //! @param[in] init + //! Initial value of the reduction + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template th run - * encountered, the first key of the run and the corresponding value - * aggregate of that run are written to `d_unique_out[i] and - * `d_aggregates_out[i]`, respectively. The total number of runs encountered - * is written to `d_num_runs_out`. - * - * @par - * - The `==` equality operator is used to determine whether keys are - * equivalent - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - Let `out` be any of - * `[d_unique_out, d_unique_out + *d_num_runs_out)` - * `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)` - * `d_num_runs_out`. The ranges represented by `out` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_values_in, d_values_in + num_items)` nor `out` in any way. - * - @devicestorage - * - * @par Performance - * The following chart illustrates reduction-by-key (sum) performance across - * different CUDA architectures for `fp32` and `fp64` values, respectively. - * Segments are identified by `int32` keys, and have lengths uniformly - * sampled from `[1, 1000]`. - * - * @image html reduce_by_key_fp32_len_500.png - * @image html reduce_by_key_fp64_len_500.png - * - * @par - * The following charts are similar, but with segment lengths uniformly - * sampled from [1,10]: - * - * @image html reduce_by_key_fp32_len_5.png - * @image html reduce_by_key_fp64_len_5.png - * - * @par Snippet - * The code snippet below illustrates the segmented reduction of `int` values - * grouped by runs of associated `int` keys. 
- * @par - * @code - * #include - * // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] - * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_num_runs_out; // e.g., [-] - * CustomMin reduction_op; - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ReduceByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_unique_out, d_values_in, - * d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduce-by-key - * cub::DeviceReduce::ReduceByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_unique_out, d_values_in, - * d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_aggregates_out <-- [0, 1, 6, 2, 4] - * // d_num_runs_out <-- [5] - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * keys \iterator - * - * @tparam UniqueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing unique - * output keys \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * values \iterator - * - * @tparam AggregatesOutputIterator - * **[inferred]** Random-access output iterator type for writing output - * value aggregates \iterator - * - * @tparam 
NumRunsOutputIteratorT - * **[inferred]** Output iterator type for recording the number of runs - * encountered \iterator - * - * @tparam ReductionOpT - * **[inferred]*8 Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Pointer to the input sequence of keys - * - * @param[out] d_unique_out - * Pointer to the output sequence of unique keys (one key per run) - * - * @param[in] d_values_in - * Pointer to the input sequence of corresponding values - * - * @param[out] d_aggregates_out - * Pointer to the output sequence of value aggregates - * (one aggregate per run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs encountered - * (i.e., the length of `d_unique_out`) - * - * @param[in] reduction_op - * Binary reduction functor - * - * @param[in] num_items - * Total number of associated key+value pairs - * (i.e., the length of `d_in_keys` and `d_in_values`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. + //! + //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op`` + //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal + //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and + //! 
the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``,
+ //! respectively. The total number of runs encountered is written to ``d_num_runs_out``.
+ //!
+ //! - The ``==`` equality operator is used to determine whether keys are equivalent
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - Let ``out`` be any of
+ //! ``[d_unique_out, d_unique_out + *d_num_runs_out)``
+ //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)``
+ //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap
+ //! ``[d_keys_in, d_keys_in + num_items)``,
+ //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of
+ //! associated ``int`` keys.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh>
+ //! // or equivalently <cub/device/device_reduce.cuh>
+ //!
+ //! // CustomMin functor
+ //! struct CustomMin
+ //! {
+ //! template <typename T>
+ //! CUB_RUNTIME_FUNCTION __forceinline__
+ //! T operator()(const T &a, const T &b) const {
+ //! return (b < a) ? b : a;
+ //! }
+ //! };
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_items; // e.g., 8
+ //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+ //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+ //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
+ //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
+ //! int *d_num_runs_out; // e.g., [-]
+ //!
CustomMin reduction_op; + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::ReduceByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_unique_out, d_values_in, + //! d_aggregates_out, d_num_runs_out, reduction_op, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run reduce-by-key + //! cub::DeviceReduce::ReduceByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_unique_out, d_values_in, + //! d_aggregates_out, d_num_runs_out, reduction_op, num_items); + //! + //! // d_unique_out <-- [0, 2, 9, 5, 8] + //! // d_aggregates_out <-- [0, 1, 6, 2, 4] + //! // d_num_runs_out <-- [5] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam UniqueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam AggregatesOutputIterator + //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator + //! + //! @tparam NumRunsOutputIteratorT + //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! 
Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Pointer to the input sequence of keys + //! + //! @param[out] d_unique_out + //! Pointer to the output sequence of unique keys (one key per run) + //! + //! @param[in] d_values_in + //! Pointer to the input sequence of corresponding values + //! + //! @param[out] d_aggregates_out + //! Pointer to the output sequence of value aggregates + //! (one aggregate per run) + //! + //! @param[out] d_num_runs_out + //! Pointer to total number of runs encountered + //! (i.e., the length of `d_unique_out`) + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] num_items + //! Total number of associated key+value pairs + //! (i.e., the length of `d_in_keys` and `d_in_values`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template *run-length encoding* - * computes a simple compressed representation of a sequence of input elements - * such that each maximal "run" of consecutive same-valued data items is - * encoded as a single data value along with a count of the elements in that - * run. - * - * @par Usage Considerations - * @cdp_class{DeviceRunLengthEncode} - * - * @par Performance - * @linear_performance{run-length encode} - * - * @par - * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode - * performance across different CUDA architectures for `int32` items. - * Segments have lengths uniformly sampled from `[1, 1000]`. - * - * @image html rle_int32_len_500.png - * - * @par - * @plots_below - */ +//! @rst +//! DeviceRunLengthEncode provides device-wide, parallel operations for +//! demarcating "runs" of same-valued items within a sequence residing +//! within device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! A `run-length encoding `_ +//! 
computes a simple compressed representation of a sequence of input elements +//! such that each maximal "run" of consecutive same-valued data items is +//! encoded as a single data value along with a count of the elements in that +//! run. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceRunLengthEncode} +//! +//! Performance +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @linear_performance{run-length encode} +//! +//! @endrst struct DeviceRunLengthEncode { - /** - * @brief Computes a run-length encoding of the sequence \p d_in. - * - * @par - * - For the *i*th run encountered, the first key of the run and - * its length are written to `d_unique_out[i]` and `d_counts_out[i]`, - * respectively. - * - The total number of runs encountered is written to `d_num_runs_out`. - * - The `==` equality operator is used to determine whether values are - * equivalent - * - In-place operations are not supported. There must be no overlap between - * any of the provided ranges: - * - `[d_unique_out, d_unique_out + *d_num_runs_out)` - * - `[d_counts_out, d_counts_out + *d_num_runs_out)` - * - `[d_num_runs_out, d_num_runs_out + 1)` - * - `[d_in, d_in + num_items)` - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated encode performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * Segments have lengths uniformly sampled from [1,1000]. - * - * @image html rle_int32_len_500.png - * @image html rle_int64_len_500.png - * - * @par - * The following charts are similar, but with segment lengths uniformly - * sampled from [1,10]: - * - * @image html rle_int32_len_5.png - * @image html rle_int64_len_5.png - * - * @par Snippet - * The code snippet below illustrates the run-length encoding of a sequence - * of `int` values. 
- * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_unique_out; // e.g., [ , , , , , , , ] - * int *d_counts_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::Encode( - * d_temp_storage, temp_storage_bytes, - * d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::Encode( - * d_temp_storage, temp_storage_bytes, - * d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_counts_out <-- [1, 2, 1, 3, 1] - * // d_num_runs_out <-- [5] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam UniqueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing unique - * output items \iterator - * - * @tparam LengthsOutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * counts \iterator - * - * @tparam NumRunsOutputIteratorT - * **[inferred]** Output iterator type for recording the number of runs - * encountered \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of keys - * - * @param[out] d_unique_out - * Pointer to the output sequence of unique keys (one key per run) - * - * @param[out] d_counts_out - * Pointer to the output sequence of run-lengths (one count per run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs - * - * @param[in] num_items - * Total number of associated key+value pairs (i.e., the length of - * `d_in_keys` and `d_in_values`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a run-length encoding of the sequence ``d_in``. + //! + //! - For the *i*\ :sup:`th` run encountered, the first key of the run and + //! its length are written to ``d_unique_out[i]`` and ``d_counts_out[i]``, respectively. + //! - The total number of runs encountered is written to ``d_num_runs_out``. + //! - The ``==`` equality operator is used to determine whether values are equivalent + //! - In-place operations are not supported. There must be no overlap between any of the provided ranges: + //! + //! - ``[d_unique_out, d_unique_out + *d_num_runs_out)`` + //! - ``[d_counts_out, d_counts_out + *d_num_runs_out)`` + //! - ``[d_num_runs_out, d_num_runs_out + 1)`` + //! - ``[d_in, d_in + num_items)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the run-length encoding of a sequence of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_unique_out; // e.g., [ , , , , , , , ] + //! int *d_counts_out; // e.g., [ , , , , , , , ] + //! 
int *d_num_runs_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceRunLengthEncode::Encode( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run encoding + //! cub::DeviceRunLengthEncode::Encode( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + //! + //! // d_unique_out <-- [0, 2, 9, 5, 8] + //! // d_counts_out <-- [1, 2, 1, 3, 1] + //! // d_num_runs_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam UniqueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing unique output items @iterator + //! + //! @tparam LengthsOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output counts @iterator + //! + //! @tparam NumRunsOutputIteratorT + //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of keys + //! + //! @param[out] d_unique_out + //! Pointer to the output sequence of unique keys (one key per run) + //! + //! @param[out] d_counts_out + //! Pointer to the output sequence of run-lengths (one count per run) + //! + //! @param[out] d_num_runs_out + //! Pointer to total number of runs + //! + //! 
@param[in] num_items + //! Total number of associated key+value pairs (i.e., the length of `d_in_keys` and `d_in_values`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template 1`) of same-valued keys in the sequence `d_in`. - * - * @par - * - For the *i*th non-trivial run, the run's starting offset and - * its length are written to `d_offsets_out[i]` and `d_lengths_out[i]`, - * respectively. - * - The total number of runs encountered is written to `d_num_runs_out`. - * - The `==` equality operator is used to determine whether values are - * equivalent - * - In-place operations are not supported. There must be no overlap between - * any of the provided ranges: - * - `[d_offsets_out, d_offsets_out + *d_num_runs_out)` - * - `[d_lengths_out, d_lengths_out + *d_num_runs_out)` - * - `[d_num_runs_out, d_num_runs_out + 1)` - * - `[d_in, d_in + num_items)` - * - @devicestorage - * - * @par Performance - * - * @par Snippet - * The code snippet below illustrates the identification of non-trivial runs - * within a sequence of `int` values. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_offsets_out; // e.g., [ , , , , , , , ] - * int *d_lengths_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::NonTrivialRuns( - * d_temp_storage, temp_storage_bytes, - * d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::NonTrivialRuns( - * d_temp_storage, temp_storage_bytes, - * d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // d_offsets_out <-- [1, 4] - * // d_lengths_out <-- [2, 3] - * // d_num_runs_out <-- [2] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OffsetsOutputIteratorT - * **[inferred]** Random-access output iterator type for writing run-offset - * values \iterator - * - * @tparam LengthsOutputIteratorT - * **[inferred]** Random-access output iterator type for writing run-length - * values \iterator - * - * @tparam NumRunsOutputIteratorT - * **[inferred]** Output iterator type for recording the number of runs - * encountered \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to input sequence of data items - * - * @param[out] d_offsets_out - * Pointer to output sequence of run-offsets - * (one offset per non-trivial run) - * - * @param[out] d_lengths_out - * Pointer to output sequence of run-lengths - * (one count per non-trivial run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs (i.e., length of `d_offsets_out`) - * - * @param[in] num_items - * Total number of associated key+value pairs (i.e., the length of - * `d_in_keys` and `d_in_values`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Enumerates the starting offsets and lengths of all non-trivial runs + //! (of ``length > 1``) of same-valued keys in the sequence ``d_in``. + //! + //! - For the *i*\ :sup:`th` non-trivial run, the run's starting offset and + //! its length are written to ``d_offsets_out[i]`` and ``d_lengths_out[i]``, respectively. + //! - The total number of runs encountered is written to ``d_num_runs_out``. + //! - The ``==`` equality operator is used to determine whether values are equivalent + //! - In-place operations are not supported. There must be no overlap between any of the provided ranges: + //! + //! - ``[d_offsets_out, d_offsets_out + *d_num_runs_out)`` + //! - ``[d_lengths_out, d_lengths_out + *d_num_runs_out)`` + //! - ``[d_num_runs_out, d_num_runs_out + 1)`` + //! - ``[d_in, d_in + num_items)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the identification of non-trivial runs + //! within a sequence of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! 
int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_offsets_out; // e.g., [ , , , , , , , ] + //! int *d_lengths_out; // e.g., [ , , , , , , , ] + //! int *d_num_runs_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceRunLengthEncode::NonTrivialRuns( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run encoding + //! cub::DeviceRunLengthEncode::NonTrivialRuns( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + //! + //! // d_offsets_out <-- [1, 4] + //! // d_lengths_out <-- [2, 3] + //! // d_num_runs_out <-- [2] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OffsetsOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing run-offset values @iterator + //! + //! @tparam LengthsOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing run-length values @iterator + //! + //! @tparam NumRunsOutputIteratorT + //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to input sequence of data items + //! + //! @param[out] d_offsets_out + //! Pointer to output sequence of run-offsets + //! 
(one offset per non-trivial run) + //! + //! @param[out] d_lengths_out + //! Pointer to output sequence of run-lengths (one count per non-trivial run) + //! + //! @param[out] d_num_runs_out + //! Pointer to total number of runs (i.e., length of `d_offsets_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template th output reduction incorporates the *i*th input. - * The term *exclusive* indicates the *i*th input is not - * incorporated into the *i*th output reduction. When the input and - * output sequences are the same, the scan is performed in-place. - * - * @par - * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our - * *"decoupled look-back"* algorithm for performing global prefix scan with - * only a single pass through the input data, as described in our 2016 technical - * report [1]. The central idea is to leverage a small, constant factor of - * redundant work in order to overlap the latencies of global prefix - * propagation with local computation. As such, our algorithm requires only - * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and - * typically proceeds at "memcpy" speeds. Our algorithm supports inplace - * operations. - * - * @par - * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) - * - * @par Usage Considerations - * @cdp_class{DeviceScan} - * - * @par Performance - * @linear_performance{prefix scan} - * - * @par - * The following chart illustrates DeviceScan::ExclusiveSum performance across - * different CUDA architectures for `int32` keys. 
- * @plots_below - * - * @image html scan_int32.png - * - */ +//! @rst +//! DeviceScan provides device-wide, parallel operations for computing a +//! prefix scan across a sequence of data items residing within +//! device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! Given a sequence of input elements and a binary reduction operator, a +//! `prefix scan `_ produces an output +//! sequence where each element is computed to be the reduction of the elements +//! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan +//! with the addition operator. The term *inclusive* indicates that the +//! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input. +//! The term *exclusive* indicates the *i*\ :sup:`th` input is not +//! incorporated into the *i*\ :sup:`th` output reduction. When the input and +//! output sequences are the same, the scan is performed in-place. +//! +//! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our +//! *"decoupled look-back"* algorithm for performing global prefix scan with +//! only a single pass through the input data, as described in our 2016 technical +//! report [1]_. The central idea is to leverage a small, constant factor of +//! redundant work in order to overlap the latencies of global prefix +//! propagation with local computation. As such, our algorithm requires only +//! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and +//! typically proceeds at "memcpy" speeds. Our algorithm supports in-place operations. +//! +//! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back +//! `_, +//! *NVIDIA Technical Report NVR-2016-002*, 2016. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceScan} +//! +//! Performance +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! 
@linear_performance{prefix scan} +//! +//! @endrst struct DeviceScan { - /******************************************************************//** - * \name Exclusive scans - *********************************************************************/ - //@{ - - /** - * @brief Computes a device-wide exclusive prefix sum. The value of `0` is - * applied as the initial value, and is assigned to `*d_out`. - * - * @par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated exclusive sum performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * - * @image html scan_int32.png - * @image html scan_int64.png - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix sum of an `int` - * device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // d_out <-- [0, 8, 14, 21, 26, 29, 29] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @name Exclusive scans + //! @{ + + + //! @rst + //! Computes a device-wide exclusive prefix sum. + //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! 
operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. + //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix sum of an ``int`` + //! device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix sum + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // d_out <-- [0, 8, 14, 21, 26, 29, 29] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! 
Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, @@ -235,81 +216,72 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide exclusive prefix sum in-place. The value of - * `0` is applied as the initial value, and is assigned to `*d_data`. - * - * @par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated exclusive sum performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * - * @image html scan_int32.png - * @image html scan_int64.png - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix sum of an `int` - * device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // d_data <-- [0, 8, 14, 21, 26, 29, 29] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access iterator type for reading scan - * inputs and wrigin scan outputs - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix sum in-place. + //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the exclusive prefix sum of an ``int`` + //! device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix sum + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // d_data <-- [0, 8, 14, 21, 26, 29, 29] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Random-access iterator to the sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, @@ -345,113 +317,108 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. 
The `init_value` value is applied as - * the initial value, and is assigned to `*d_out`. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an - * `int` device vector - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... 
- * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to *d_out) - * - * @param[in] num_items - * Total number of input items (i.e., the length of \p d_in) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. 
Default is - * stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified + //! binary ``scan_op`` functor. The ``init_value`` value is applied as + //! the initial value, and is assigned to ``*d_out``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The + //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, (int) INT_MAX, num_items); + //! + //! 
// Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, (int) INT_MAX, num_items); + //! + //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used Binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * CustomMin min_op; - * ... - * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, (int) INT_MAX, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, (int) INT_MAX, num_items); - * - * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to *d_out) - * - * @param[in] num_items - * Total number of input items (i.e., the length of \p d_in) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. Default is - * stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified + //! binary ``scan_op`` functor. The ``init_value`` value is applied as + //! the initial value, and is assigned to ``*d_data``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an + //! ``int`` device vector: + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! CustomMin min_op; + //! ... + //! + //! 
// Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, (int) INT_MAX, num_items); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, (int) INT_MAX, num_items); + //! + //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used Binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Random-access iterator to the sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template @@ -656,118 +620,112 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. The `init_value` value is provided as - * a future value. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an - * `int` device vector - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * int *d_init_iter; // e.g., INT_MAX - * CustomMin min_op; - * - * auto future_init_value = - * cub::FutureValue(d_init_iter); - * - * ... 
- * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, future_init_value, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, future_init_value, num_items); - * - * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to `*d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. 
- * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified + //! binary ``scan_op`` functor. The ``init_value`` value is provided as a future value. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. + //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! int *d_init_iter; // e.g., INT_MAX + //! CustomMin min_op; + //! + //! auto future_init_value = + //! cub::FutureValue(d_init_iter); + //! + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! 
d_in, d_out, min_op, future_init_value, num_items); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, future_init_value, num_items); + //! + //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used Binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_init_iter; // e.g., INT_MAX - * CustomMin min_op; - * - * auto future_init_value = - * cub::FutureValue(d_init_iter); - * - * ... - * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, future_init_value, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, future_init_value, num_items); - * - * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in,out] d_data - * Pointer to the sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to `*d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. + //! The ``init_value`` value is provided as a future value. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_init_iter; // e.g., INT_MAX + //! CustomMin min_op; + //! + //! auto future_init_value = + //! 
cub::FutureValue(d_init_iter); + //! + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, future_init_value, num_items); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, future_init_value, num_items); + //! + //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used by the binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Pointer to the sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_data`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive - * // prefix sum - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // d_out <-- [8, 14, 21, 26, 29, 29, 38] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @} end member group + //! @name Inclusive scans + //! @{ + + //! @rst + //! 
Computes a device-wide inclusive prefix sum. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The + //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix sum + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // Allocate temporary storage for inclusive prefix sum + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix sum + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // d_out <-- [8, 14, 21, 26, 29, 29, 38] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @param[in] d_temp_storage + //! 
Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, @@ -1119,75 +1064,71 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide inclusive prefix sum in-place. - * - * @par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the inclusive prefix sum of an `int` - * device vector. - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * ... 
- * - * // Determine temporary device storage requirements for inclusive - * // prefix sum - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // d_data <-- [8, 14, 21, 26, 29, 29, 38] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix sum in-place. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix sum + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // Allocate temporary storage for inclusive prefix sum + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix sum + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // d_data <-- [8, 14, 21, 26, 29, 29, 38] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Random-access iterator to the sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, @@ -1223,106 +1164,100 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide inclusive prefix scan using the specified - * binary `scan_op` functor. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an - * `int` device vector. - * - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive - * // prefix scan - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, num_items); - * - * // d_out <-- [8, 6, 6, 5, 3, 0, 0] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @param[in] - * d_temp_storage Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. + //! + //! - Supports non-commutative scan operators. 
+ //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The + //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix scan + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, num_items); + //! + //! // Allocate temporary storage for inclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix min-scan + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, num_items); + //! + //! // d_out <-- [8, 6, 6, 5, 3, 0, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! 
**[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] + //! d_temp_storage Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to + //! `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(void *d_temp_storage, @@ -1374,95 +1309,90 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide inclusive prefix scan using the specified - * binary `scan_op` functor. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an - * `int` device vector. 
- * - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * CustomMin min_op; - * ... - * - * // Determine temporary device storage requirements for inclusive - * // prefix scan - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, num_items); - * - * // d_data <-- [8, 6, 6, 5, 3, 0, 0] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @param[in] - * d_temp_storage Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. 
- * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! CustomMin min_op; + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix scan + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, num_items); + //! + //! // Allocate temporary storage for inclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix min-scan + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, num_items); + //! + //! // d_data <-- [8, 6, 6, 5, 3, 0, 0] + //! + //! @endrst + //! + //! @tparam IteratorT + //! 
**[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
+ //!
+ //! @tparam ScanOp
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage.
+ //! When `nullptr`, the required allocation size is written to
+ //! `temp_storage_bytes` and no work is done.
+ //!
+ //! @param[in,out] temp_storage_bytes
+ //! Reference to size in bytes of `d_temp_storage` allocation
+ //!
+ //! @param[in] d_data
+ //! Random-access iterator to the sequence of data items
+ //!
+ //! @param[in] scan_op
+ //! Binary scan functor
+ //!
+ //! @param[in] num_items
+ //! Total number of input items (i.e., the length of `d_data`)
+ //!
+ //! @param[in] stream
+ //! @rst
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+ //! @endrst
 template <typename IteratorT, typename ScanOp>
 CUB_RUNTIME_FUNCTION static cudaError_t
 InclusiveScan(void *d_temp_storage,
- * - `d_values_in` may equal `d_values_out` but the range - * `[d_values_in, d_values_in + num_items)` and the range - * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix sum-by-key of an - * `int` device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan keys - * inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * values inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * values outputs \iterator - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix sum-by-key with key equality + //! defined by ``equality_op``. The value of ``0`` is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! 
Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix sum + //! cub::DeviceScan::ExclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! 
@param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, - * (int) INT_MAX, num_items, equality_op); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, - * (int) INT_MAX, num_items, equality_op); - * - * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan keys - * inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan values - * inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan values - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` value used in Binary scan - * functor type having member `T operator()(const T &a, const T &b)` - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to the - * beginning of each segment in `d_values_out`) - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan-by-key using the + //! specified binary ``scan_op`` functor. The key equality is defined by + //! ``equality_op``. The ``init_value`` value is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! 
``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // CustomEqual functor + //! struct CustomEqual + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return a == b; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! CustomEqual equality_op; + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, + //! (int) INT_MAX, num_items, equality_op); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, + //! (int) INT_MAX, num_items, equality_op); + //! + //! 
// d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` value used in Binary scan + //! functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to the + //! beginning of each segment in `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and + //! `d_values_in`) + //! + //! @param[in] equality_op + //! 
Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_out <-- [8, 14, 7, 12, 15, 0, 9] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * keys inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * values inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * values outputs \iterator - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! 
+ //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements for inclusive prefix sum + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // Allocate temporary storage for inclusive prefix sum + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix sum + //! cub::DeviceScan::InclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // d_out <-- [8, 14, 7, 12, 15, 0, 9] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! 
@param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // d_out <-- [8, 6, 7, 5, 3, 0, 0] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan keys - * inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * values inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * values outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix scan-by-key using the + //! specified binary ``scan_op`` functor. The key equality is defined by ``equality_op``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // CustomEqual functor + //! struct CustomEqual + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return a == b; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! CustomEqual equality_op; + //! ... + //! + //! // Determine temporary device storage requirements for inclusive prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); + //! + //! // Allocate temporary storage for inclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix min-scan + //! cub::DeviceScan::InclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); + //! + //! // d_out <-- [8, 6, 7, 5, 3, 0, 0] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! 
+ //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template `_ +//! arranges items into ascending (or descending) order. The algorithm relies +//! upon a positional representation for keys, i.e., each key is comprised of an +//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from +//! least-significant to most-significant. For a given input sequence of keys +//! and a set of rules specifying a total ordering of the symbolic alphabet, the +//! 
radix sorting method produces a lexicographic ordering of those keys. +//! +//! See Also +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See +//! that algorithm's documentation for more information. +//! +//! Segments are not required to be contiguous. Any element of input(s) or +//! output(s) outside the specified segments will not be accessed nor modified. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceSegmentedRadixSort} +//! +//! @endrst struct DeviceSegmentedRadixSort { - /******************************************************************//** - * @name Key-value pairs - *********************************************************************/ - //@{ + //! @name Key-value pairs + //! @{ - /** - * @brief Sorts segments of key-value pairs into ascending order. - * (`~2N` auxiliary storage required) - * - * @par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - @devicestorageNP For sorting using only `O(P)` temporary storage, see - * the sorting interface using DoubleBuffer wrappers below. 
- * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys with associated vector of - * `int` values. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading 
segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. If - * `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. 
- * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. (``~2N`` auxiliary storage required) + //! + //! - The contents of the input data are not altered by the sorting operation + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! 
(with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! 
+ //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! Random-access input iterator to the sequence of beginning offsets of + //! length `num_segments`, such that `d_begin_offsets[i]` is the first + //! element of the *i*th data segment in `d_keys_*` and `d_values_*` + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. If + //! ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. 
Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer - * contains the unsorted input values and, upon return, is updated to point - * to the sorted output values - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) + //! + //! 
- The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! - The contents of both buffers within each pair may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and yield + //! a corresponding performance improvement. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! - @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedRadixSort::SortPairs(
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run sorting operation
+ //! cub::DeviceSegmentedRadixSort::SortPairs(
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+ //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
+ //!
+ //! @endrst
+ //!
+ //! @tparam KeyT
+ //! **[inferred]** Key type
+ //!
+ //! @tparam ValueT
+ //! **[inferred]** Value type
+ //!
+ //! @tparam BeginOffsetIteratorT
+ //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
+ //!
+ //! @tparam EndOffsetIteratorT
+ //! 
**[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point + //! to the sorted output values + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! 
**[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * 
@param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th - * is considered empty. 
- * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). + //! + //! - The contents of the input data are not altered by the sorting operation + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! 
(with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! 
comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of 
temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer - * contains the unsorted input values and, upon return, is updated to point - * to the sorted output values - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th - * is considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! 
Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! - The contents of both buffers within each pair may be altered by the + //! sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! 
- @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point + //! to the sorted output values + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! 
If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. (``~2N`` auxiliary storage required) + //! + //! - The contents of the input data are not altered by the sorting operation + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! 
bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of `int` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! 
cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! 
``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1165,132 +1155,136 @@ struct DeviceSegmentedRadixSort stream); } - /** - * @brief Sorts segments of keys into ascending order. (~N auxiliary storage required). - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the - * number of key bits specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter - * is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. 
- * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - @devicestorageP - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** 
Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1` <= d_begin_offsets[i]`, the *i*th - * is considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) - * needed for key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. 
(``~N`` auxiliary storage required). + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the + //! number of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys.Alternate()[i]`` will not be accessed nor modified. + //! - @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! 
// or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! 
buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) + //! needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1363,127 +1357,126 @@ struct DeviceSegmentedRadixSort stream); } - /** - * @brief Sorts segments of keys into descending order. - * (`~2N` auxiliary storage required). 
- * - * @par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter - * is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - @devicestorageNP For sorting using only `O(P)` temporary storage, see - * the sorting interface using DoubleBuffer wrappers below. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. 
- * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., sizeof(unsigned int) * 8) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). + //! + //! - The contents of the input data are not altered by the sorting operation + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. 
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! 
**[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., sizeof(unsigned int) * 8) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1559,132 +1552,134 @@ struct DeviceSegmentedRadixSort stream); } - /** - * @brief Sorts segments of keys into descending order. - * (`~N` auxiliary storage required). - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the - * number of key bits specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. 
For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - @devicestorageP - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i], the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! 
- The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the + //! number of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. + //! - @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of `int` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! 
int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. 
`num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1757,9 +1752,7 @@ struct DeviceSegmentedRadixSort stream); } - //@} end member group + //! 
@} end member group }; CUB_NAMESPACE_END - - diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index a227a2f1e9..861d6c6b6b 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -26,11 +26,9 @@ * ******************************************************************************/ -/** - * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations - * for computing a batched reduction across multiple sequences of data - * items residing within device-accessible memory. - */ +//! @file cub::DeviceSegmentedReduce provides device-wide, parallel operations +//! for computing a batched reduction across multiple sequences of data +//! items residing within device-accessible memory. #pragma once @@ -55,155 +53,155 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DeviceSegmentedReduce provides device-wide, parallel operations for - * computing a reduction across multiple sequences of data items - * residing within device-accessible memory. ![](reduce_logo.png) - * @ingroup SegmentedModule - * - * @par Overview - * A *reduction* - * (or *fold*) uses a binary combining operator to compute a single aggregate - * from a sequence of input elements. - * - * @par Usage Considerations - * @cdp_class{DeviceSegmentedReduce} - * - */ +//! @rst +//! DeviceSegmentedReduce provides device-wide, parallel operations for +//! computing a reduction across multiple sequences of data items +//! residing within device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! A `reduction `_ +//! (or *fold*) uses a binary combining operator to compute a single aggregate +//! from a sequence of input elements. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceSegmentedReduce} +//! +//! 
@endrst struct DeviceSegmentedReduce { - /** - * @brief Computes a device-wide segmented reduction using the specified - * binary `reduction_op` functor. - * - * @par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates a custom min-reduction of a device - * vector of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * CustomMin min_op; - * int initial_value; // e.g., INT_MAX - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Reduce( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceSegmentedReduce::Reduce( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // d_out <-- [6, INT_MAX, 0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @tparam ReductionOpT - * **[inferred]** Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type - * of `InputIteratorT` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] reduction_op - * Binary reduction functor - * - * @param[in] initial_value - * Initial value of the reduction for each segment - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented reduction using the specified + //! binary ``reduction_op`` functor. + //! + //! - Does not support binary reduction operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to another device of a different compute-capability + //! because CUB can employ different tile-sizing for different architectures. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! 
the latter is specified as ``segment_offsets + 1``). + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates a custom min-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! CustomMin min_op; + //! int initial_value; // e.g., INT_MAX + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run reduction + //! cub::DeviceSegmentedReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + //! + //! // d_out <-- [6, INT_MAX, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! 
**[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam T + //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! 
@endrst + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] initial_value + //! Initial value of the reduction for each segment + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Sum( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceSegmentedReduce::Sum( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [21, 0, 17] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional] CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented sum using the addition (``+``) operator. + //! + //! - Uses ``0`` as the initial value of the reduction for each segment. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - Does not support ``+`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Sum( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sum-reduction + //! cub::DeviceSegmentedReduce::Sum( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [21, 0, 17] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! 
@param[out] d_out
+ //! Pointer to the output aggregate
+ //!
+ //! @param[in] num_segments
+ //! The number of segments that comprise the sorting data
+ //!
+ //! @param[in] d_begin_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of beginning offsets of
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
+ //! ``d_values_*``
+ //! @endrst
+ //!
+ //! @param[in] d_end_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of ending offsets of length
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
+ //! @endrst
+ //!
+ //! @param[in] stream
+ //! @rst
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+ //! @endrst
template ::max()` as the initial value of the
- * reduction for each segment.
- * - When input a contiguous sequence of segments, a single sequence
- * `segment_offsets` (of length `num_segments + 1`) can be aliased for both
- * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
- * specified as `segment_offsets + 1`).
- * - Does not support `<` operators that are non-commutative.
- * - Let `s` be in `[0, num_segments)`. The range
- * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
- * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
- * `[d_begin_offsets, d_begin_offsets + num_segments)` nor
- * `[d_end_offsets, d_end_offsets + num_segments)`.
- * - @devicestorage
- *
- * @par Snippet
- * The code snippet below illustrates the min-reduction of a device vector of
- * `int` data elements.
- * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceSegmentedReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [6, INT_MAX, 0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented minimum using the less-than (``<``) operator. + //! + //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction for each segment. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - Does not support ``<`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! 
+++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run min-reduction + //! cub::DeviceSegmentedReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [6, INT_MAX, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! 
Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template ` - * (assuming the value type of `d_in` is `T`) - * - The minimum of the *i*th segment is written to - * `d_out[i].value` and its offset in that segment is written to - * `d_out[i].key`. - * - The `{1, std::numeric_limits::max()}` tuple is produced for - * zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter - * is specified as `segment_offsets + 1`). - * - Does not support `<` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceSegmentedReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type `T`) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `KeyValuePair`) \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the - * *i*th is considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide minimum in each segment using the + //! less-than (``<``) operator, also returning the in-segment index of that item. + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The minimum of the *i*\ :sup:`th` segment is written to + //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. + //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - Does not support ``<`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! 
``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::ArgMin( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run argmin-reduction + //! cub::DeviceSegmentedReduce::ArgMin( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate + //! (having value type `KeyValuePair`) @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template `) operator. - * - * @par - * - Uses `std::numeric_limits::lowest()` as the initial value of the - * reduction. - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - Does not support `>` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. 
The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the max-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceSegmentedReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [8, INT_MIN, 9] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. + //! + //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - Does not support ``>`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! 
``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Max( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run max-reduction + //! cub::DeviceSegmentedReduce::Max( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [8, INT_MIN, 9] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! 
@param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template ') operator, also returning the in-segment index of - * that item - * - * @par - * - The output value type of `d_out` is `cub::KeyValuePair` - * (assuming the value type of `d_in` is `T`) - * - The maximum of the *i*th segment is written to - * `d_out[i].value` and its offset in that segment is written to - * `d_out[i].key`. - * - The `{1, std::numeric_limits::lowest()}` tuple is produced for - * zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - Does not support `>` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. 
The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceSegmentedReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type `T`) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `KeyValuePair`) \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary 
storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide maximum in each segment using the + //! greater-than (``>``) operator, also returning the in-segment index of that item + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The maximum of the *i*\ :sup:`th` segment is written to + //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. + //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! 
- Does not support ``>`` operators that are non-commutative.
+ //! - Let ``s`` be in ``[0, num_segments)``. The range
+ //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
+ //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the argmax-reduction of a device vector
+ //! of ``int`` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include 
+ //! // or equivalently 
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out,
+ //! num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run argmax-reduction
+ //! cub::DeviceSegmentedReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out,
+ //! num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+ //!
+ //! @endrst
+ //!
+ //! @tparam InputIteratorT
+ //! **[inferred]** Random-access input iterator type for reading input items
+ //! (of some type `T`) @iterator
+ //!
+ //! @tparam OutputIteratorT
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
+ //! (having value type `KeyValuePair`) @iterator
+ //!
+ //! @tparam BeginOffsetIteratorT
+ //!
**[inferred]** Random-access input iterator type for reading segment
+ //! beginning offsets @iterator
+ //!
+ //! @tparam EndOffsetIteratorT
+ //! **[inferred]** Random-access input iterator type for reading segment
+ //! ending offsets @iterator
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
+ //! required allocation size is written to `temp_storage_bytes` and no work
+ //! is done.
+ //!
+ //! @param[in,out] temp_storage_bytes
+ //! Reference to size in bytes of `d_temp_storage` allocation
+ //!
+ //! @param[in] d_in
+ //! Pointer to the input sequence of data items
+ //!
+ //! @param[out] d_out
+ //! Pointer to the output aggregate
+ //!
+ //! @param[in] num_segments
+ //! The number of segments that comprise the sorting data
+ //!
+ //! @param[in] d_begin_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of beginning offsets of
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+ //! @endrst
+ //!
+ //! @param[in] d_end_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of ending offsets of length
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
+ //! @endrst
+ //!
+ //! @param[in] stream
+ //! @rst
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+ //!
@endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - */ +//! @rst +//! DeviceSegmentedSort provides device-wide, parallel operations for +//! computing a batched sort across multiple, non-overlapping sequences of +//! data items residing within device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! The algorithm arranges items into ascending (or descending) order. +//! The underlying sorting algorithm is undefined. Depending on the segment size, +//! it might be radix sort, merge sort or something else. Therefore, no +//! assumptions on the underlying implementation should be made. +//! +//! Differences from DeviceSegmentedRadixSort +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! DeviceSegmentedRadixSort is optimized for significantly large segments (tens +//! 
of thousands of items and more). Nevertheless, some domains produce a wide +//! range of segment sizes. DeviceSegmentedSort partitions segments into size +//! groups and specialize sorting algorithms for each group. This approach leads +//! to better resource utilization in the presence of segment size imbalance or +//! moderate segment sizes (up to thousands of items). +//! This algorithm is more complex and consists of multiple kernels. This fact +//! leads to longer compilation times as well as larger binaries sizes. +//! +//! Supported Types +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! The algorithm has to satisfy the underlying algorithms restrictions. Radix +//! sort usage restricts the list of supported types. Therefore, +//! DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types +//! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half`` and +//! ``__nv_bfloat16`` 16-bit floating-point types. +//! +//! Segments are not required to be contiguous. Any element of input(s) or +//! output(s) outside the specified segments will not be accessed nor modified. +//! +//! A simple example +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! .. code-block:: c++ +//! +//! #include +//! // or equivalently +//! +//! // Declare, allocate, and initialize device-accessible pointers +//! // for sorting data +//! int num_items; // e.g., 7 +//! int num_segments; // e.g., 3 +//! int *d_offsets; // e.g., [0, 3, 3, 7] +//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] +//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] +//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] +//! int *d_values_out; // e.g., [-, -, -, -, -, -, -] +//! ... +//! +//! // Determine temporary device storage requirements +//! void *d_temp_storage = NULL; +//! size_t temp_storage_bytes = 0; +//! cub::DeviceSegmentedSort::SortPairs( +//! d_temp_storage, temp_storage_bytes, +//! d_keys_in, d_keys_out, d_values_in, d_values_out, +//! 
num_items, num_segments, d_offsets, d_offsets + 1); +//! +//! // Allocate temporary storage +//! cudaMalloc(&d_temp_storage, temp_storage_bytes); +//! +//! // Run sorting operation +//! cub::DeviceSegmentedSort::SortPairs( +//! d_temp_storage, temp_storage_bytes, +//! d_keys_in, d_keys_out, d_values_in, d_values_out, +//! num_items, num_segments, d_offsets, d_offsets + 1); +//! +//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] +//! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] +//! +//! @endrst struct DeviceSegmentedSort { - - /*************************************************************************//** - * @name Keys-only - ****************************************************************************/ - //@{ - - /** - * @brief Sorts segments of keys into ascending order. Approximately - * `num_items + 2*num_segments` auxiliary storage required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - SortKeys is not guaranteed to be stable. That is, suppose that @p i and - * @p j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two elements will be - * preserved by sort. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. 
- * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible - * // pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @name Keys-only + //! @{ + + //! @rst + //! Sorts segments of keys into ascending order. + //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as `segment_offsets+1`). + //! - SortKeys is not guaranteed to be stable. That is, suppose that ``i`` and + //! ``j`` are equivalent: neither one is less than the other. It is not + //! 
guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible + //! // pointers for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! 
beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -303,112 +307,113 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. Approximately - * `num_items + 2*num_segments` auxiliary storage required. 
- * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments + 1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets + 1`). - * - SortKeysDescending is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no - * work is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. Approximately + //! ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! 
not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -477,122 +482,125 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into ascending order. Approximately - * `2*num_segments` auxiliary storage required. 
- * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - SortKeys is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. 
- * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible - * // pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no - * work is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` - * and `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! 
of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortKeys is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible + //! // pointers for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! 
size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! 
Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -658,122 +666,126 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. Approximately - * `2*num_segments` auxiliary storage required. - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments + 1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets + 1`). - * - SortKeysDescending is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. 
Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible 
allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1<= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. Approximately + //! ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! 
buffers now contains the sorted output sequence (a function of the number + //! of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! 
// Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! 
element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -839,113 +851,117 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into ascending order. Approximately - * `num_items + 2*num_segments` auxiliary storage required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeys is stable: it preserves the relative ordering of - * equivalent elements. That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. 
- * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. Approximately + //! ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortKeys is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! 
``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! 
@tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1002,113 +1018,117 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. 
Approximately - * `num_items + 2*num_segments` auxiliary storage required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeysDescending is stable: it preserves the relative ordering of - * equivalent elements. That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. + //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortKeysDescending is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither ``x < y`` nor ``y < x``) + //! 
then a postcondition of stable sort is that ``x`` still precedes ``y``. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and + //! ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1167,123 +1187,128 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into ascending order. 
Approximately - * `2*num_segments` auxiliary storage required. - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeys is stable: it preserves the relative ordering of - * equivalent elements. That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. 
- * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! 
of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortKeys is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! 
// Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! 
element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1336,123 +1361,127 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. Approximately - * `2*num_segments` auxiliary storage required. - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeysDescending is stable: it preserves the relative ordering of - * equivalent elements. 
That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that d_end_offsets[i]-1 is the last - * element of the ith data segment in `d_keys_*` and - * `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the - * i-th segment is considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! 
``segment_offsets`` (of length ``num_segments + 1``) can be aliased
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
+ //! the latter is specified as ``segment_offsets + 1``).
+ //! - StableSortKeysDescending is stable: it preserves the relative ordering of
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
+ //! ``x`` still precedes ``y``.
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
+ //! The range ``[cur, cur + num_items)`` shall not overlap
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
+ //! - Segments are not required to be contiguous. For all index values ``i``
+ //! outside the specified segments ``d_keys.Current()[i]``,
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/device/device_segmented_sort.cuh> // or equivalently <cub/cub.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! 
cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! 
Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and + //! ``d_values_*``. If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the + //! ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1507,137 +1536,139 @@ struct DeviceSegmentedSort stream); } - //@} end member group - /*************************************************************************//** - * @name Key-value pairs - ****************************************************************************/ - //@{ - - /** - * @brief Sorts segments of key-value pairs into ascending order. - * Approximately `2*num_items + 2*num_segments` auxiliary storage - * required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - SortPairs is not guaranteed to be stable. That is, suppose that @p i and - * @p j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. 
For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys with associated vector of - * @p int values. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible 
allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @} end member group + //! @name Key-value pairs + //! @{ + + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! 
``segment_offsets`` (of length ``num_segments + 1``) can be aliased
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
+ //! the latter is specified as ``segment_offsets + 1``).
+ //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
+ //! ``j`` are equivalent: neither one is less than the other. It is not
+ //! guaranteed that the relative order of these two elements will be
+ //! preserved by sort.
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
+ //! not overlap ``[in, in + num_items)``,
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
+ //! - Segments are not required to be contiguous. For all index values ``i``
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
+ //! ``int`` values.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/device/device_segmented_sort.cuh>
+ //! // or equivalently <cub/cub.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedSort::SortPairs(
+ //! 
d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! 
The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i]-1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! 
- SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
+ //! ``j`` are equivalent: neither one is less than the other. It is not
+ //! guaranteed that the relative order of these two elements will be
+ //! preserved by sort.
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
+ //! not overlap ``[in, in + num_items)``,
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
+ //! - Segments are not required to be contiguous. For all index values ``i``
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
+ //! ``int`` values.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/device/device_segmented_sort.cuh>
+ //! // or equivalently <cub/cub.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers for
+ //! // sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
+ //! d_temp_storage, temp_storage_bytes,
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // Allocate temporary storage
+ //! 
cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! 
length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! - The contents of both buffers within each pair may be altered by the sorting + //! operation. + //! 
- Upon completion, the sorting operation will update the "current" indicator + //! within each DoubleBuffer wrapper to reference which of the two buffers + //! now contains the sorted output sequence (a function of the number of key bits + //! specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and + //! ``j`` are equivalent: neither one is less than the other. It is not + //! guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! 
int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! 
Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template num_segments+1) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - SortPairsDescending is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` - * be any of `{d_keys.Alternate(), d_values.Alternate()}`. 
The range - * `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, - * `d_values.Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys with associated vector of - * @p int values. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! 
- The contents of both buffers within each pair may be altered by the + //! sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortPairsDescending is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> // or equivalently + //! + //! 
// Declare, allocate, and initialize device-accessible pointers for + //! // sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! 
+ //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! 
- StableSortPairs is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! 
num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! 
Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! 
for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortPairsDescending is stable: it preserves the relative ordering + //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! 
cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! 
The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! 
- The contents of both buffers within each pair may be altered by the + //! sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortPairs is stable: it preserves the relative ordering + //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! 
#include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! 
- The contents of both buffers within each pair may be altered by the sorting + //! operation. + //! - Upon completion, the sorting operation will update the "current" indicator + //! within each DoubleBuffer wrapper to reference which of the two buffers + //! now contains the sorted output sequence (a function of the number of key bits + //! specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortPairsDescending is stable: it preserves the relative ordering + //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. 
code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, - * // flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7] - * // d_num_selected_out <-- [4] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading selection - * flags \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[in] d_flags - * Pointer to the input sequence of selection flags - * - * @param[out] d_out - * Pointer to the output sequence of selected data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * (i.e., length of `d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``d_flags`` sequence to selectively copy the corresponding items from ``d_in`` into ``d_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). + //! - Copies of the selected items are compacted into ``d_out`` and maintain their original relative ordering. + //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap ``[d_in, d_in + num_items)``, + //! | ``[d_flags, d_flags + num_items)`` nor ``d_num_selected_out`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input, + //! // flags, and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! 
void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // d_out <-- [1, 4, 6, 7] + //! // d_num_selected_out <-- [4] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam FlagIterator + //! **[inferred]** Random-access input iterator type for reading selection flags @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[in] d_flags + //! Pointer to the input sequence of selection flags + //! + //! @param[out] d_out + //! Pointer to the output sequence of selected data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected (i.e., length of `d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, - * // flags, and output - * int num_items; // e.g., 8 - * int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_num_selected_out, num_items); - * - * // d_data <-- [1, 4, 6, 7] - * // d_num_selected_out <-- [4] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access iterator type for reading and writing - * selected items \iterator - * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading selection - * flags \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Pointer to the sequence of data items - * - * @param[in] d_flags - * Pointer to the input sequence of selection flags - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_data`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``d_flags`` sequence to selectively compact the items in ``d_data``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). + //! - Copies of the selected items are compacted in-place and maintain their original relative ordering. + //! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap + //! | ``[d_flags, d_flags + num_items)`` in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input, + //! // flags, and output + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_data, d_flags, d_num_selected_out, num_items); + //! + //! 
// Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_num_selected_out, num_items); + //! + //! // d_data <-- [1, 4, 6, 7] + //! // d_num_selected_out <-- [4] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access iterator type for reading and writing selected items @iterator + //! + //! @tparam FlagIterator + //! **[inferred]** Random-access input iterator type for reading selection flags @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Pointer to the sequence of data items + //! + //! @param[in] d_flags + //! Pointer to the input sequence of selection flags + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -403,125 +373,107 @@ struct DeviceSelect stream); } - /** - * @brief Uses the `select_op` functor to selectively copy items from `d_in` - * into `d_out`. The total number of items selected is written to - * `d_num_selected_out`. ![](select_logo.png) - * - * @par - * - Copies of the selected items are compacted into `d_out` and maintain - * their original relative ordering. 
- * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap - * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated select-if performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * Items are selected with 50% probability. - * - * @image html select_if_int32_50_percent.png - * @image html select_if_int64_50_percent.png - * - * @par - * The following charts are similar, but 5% selection probability: - * - * @image html select_if_int32_5_percent.png - * @image html select_if_int64_5_percent.png - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an `int` device vector. - * @par - * @code - * #include // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectOp - * **[inferred]** Selection operator type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of selected data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * (i.e., length of `d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] select_op - * Unary selection operator - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! 
Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering. + //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap + //! | ``[d_in, d_in + num_items)`` nor ``d_num_selected_out`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const { + //! return (a < compare); + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! LessThan select_op(7); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // d_out <-- [0, 2, 3, 5, 2] + //! 
// d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam SelectOp + //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of selected data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! (i.e., length of `d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] select_op + //! Unary selection operator + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_data, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_data, d_num_selected_out, num_items, select_op); - * - * // d_data <-- [0, 2, 3, 5, 2] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading and - * writing items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectOp - * **[inferred]** Selection operator type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Pointer to the sequence of data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_data`) - * - * @param[in] select_op - * Unary selection operator - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - | Copies of the selected items are compacted in ``d_data`` and maintain + //! | their original relative ordering. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const { + //! return (a < compare); + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_num_selected_out; // e.g., [ ] + //! LessThan select_op(7); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! 
d_data, d_num_selected_out, num_items, select_op); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! d_data, d_num_selected_out, num_items, select_op); + //! + //! // d_data <-- [0, 2, 3, 5, 2] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading and writing items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam SelectOp + //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Pointer to the sequence of data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_data`) + //! + //! @param[in] select_op + //! Unary selection operator + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -745,107 +694,87 @@ struct DeviceSelect stream); } - /** - * @brief Given an input sequence `d_in` having runs of consecutive - * equal-valued keys, only the first key from each run is selectively - * copied to `d_out`. The total number of items selected is written to - * `d_num_selected_out`. 
![](unique_logo.png) - * - * @par - * - The `==` equality operator is used to determine whether keys are - * equivalent - * - Copies of the selected items are compacted into `d_out` and maintain - * their original relative ordering. - * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap - * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated select-unique performance across different - * CUDA architectures for `int32` and `int64` items, respectively. Segments - * have lengths uniformly sampled from `[1, 1000]`. - * - * @image html select_unique_int32_len_500.png - * @image html select_unique_int64_len_500.png - * - * @par - * The following charts are similar, but with segment lengths uniformly - * sampled from `[1, 10]`: - * - * @image html select_unique_int32_len_5.png - * @image html select_unique_int64_len_5.png - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an `int` device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Unique( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Unique( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [0, 2, 9, 5, 8] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of selected data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * (i.e., length of `d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Given an input sequence ``d_in`` having runs of consecutive equal-valued keys, + //! only the first key from each run is selectively copied to ``d_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! 
- The ``==`` equality operator is used to determine whether keys are equivalent + //! - Copies of the selected items are compacted into ``d_out`` and maintain their original relative ordering. + //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap + //! | ``[d_in, d_in + num_items)`` nor ``d_num_selected_out`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::Unique( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::Unique( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items); + //! + //! // d_out <-- [0, 2, 9, 5, 8] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! 
Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of selected data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! (i.e., length of `d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -908,118 +837,112 @@ struct DeviceSelect stream); } - /** - * @brief Given an input sequence `d_keys_in` and `d_values_in` with runs of - * key-value pairs with consecutive equal-valued keys, only the first - * key and its value from each run is selectively copied to - * `d_keys_out` and `d_values_out`. The total number of items selected - * is written to `d_num_selected_out`. ![](unique_logo.png) - * - * @par - * - The `==` equality operator is used to determine whether keys are - * equivalent - * - Copies of the selected items are compacted into `d_out` and maintain - * their original relative ordering. - * - In-place operations are not supported. There must be no overlap between - * any of the provided ranges: - * - `[d_keys_in, d_keys_in + num_items)` - * - `[d_keys_out, d_keys_out + *d_num_selected_out)` - * - `[d_values_in, d_values_in + num_items)` - * - `[d_values_out, d_values_out + *d_num_selected_out)` - * - `[d_num_selected_out, d_num_selected_out + 1)` - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an `int` device vector. 
- * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * int *d_keys_out; // e.g., [ , , , , , , , ] - * int *d_values_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::UniqueByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, - * d_keys_out, d_values_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::UniqueByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, - * d_keys_out, d_values_out, d_num_selected_out, num_items); - * - * // d_keys_out <-- [0, 2, 9, 5, 8] - * // d_values_out <-- [1, 2, 4, 5, 8] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam KeyInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * keys \iterator - * - * @tparam ValueInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * values \iterator - * - * @tparam KeyOutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * keys \iterator - * - * @tparam ValueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * values \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Pointer to the input sequence of keys - * - * @param[in] d_values_in - * Pointer to the input sequence of values - * - * @param[out] d_keys_out - * Pointer to the output sequence of selected keys - * - * @param[out] d_values_out - * Pointer to the output sequence of selected values - * - * @param[out] d_num_selected_out - * Pointer to the total number of items selected (i.e., length of - * `d_keys_out` or `d_values_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_keys_in` or - * `d_values_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive + //! equal-valued keys, only the first key and its value from each run is selectively copied + //! to ``d_keys_out`` and ``d_values_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The ``==`` equality operator is used to determine whether keys are equivalent + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering. + //! - In-place operations are not supported. There must be no overlap between + //! any of the provided ranges: + //! + //! - ``[d_keys_in, d_keys_in + num_items)`` + //! - ``[d_keys_out, d_keys_out + *d_num_selected_out)`` + //! - ``[d_values_in, d_values_in + num_items)`` + //! - ``[d_values_out, d_values_out + *d_num_selected_out)`` + //! - ``[d_num_selected_out, d_num_selected_out + 1)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! 
#include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! int *d_keys_out; // e.g., [ , , , , , , , ] + //! int *d_values_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // d_keys_out <-- [0, 2, 9, 5, 8] + //! // d_values_out <-- [1, 2, 4, 5, 8] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected keys @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected values @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Pointer to the input sequence of keys + //! + //! @param[in] d_values_in + //! Pointer to the input sequence of values + //! + //! @param[out] d_keys_out + //! Pointer to the output sequence of selected keys + //! + //! @param[out] d_values_out + //! Pointer to the output sequence of selected values + //! + //! @param[out] d_num_selected_out + //! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) - * performs the matrix-vector operation - * y = A*x + y, - * where: - * - A is an mxn sparse matrix whose non-zero structure is specified in - * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) - * (i.e., three arrays: values, row_offsets, and column_indices) - * - x and y are dense vectors - * - * @par Usage Considerations - * @cdp_class{DeviceSpmv} - * - */ +//! @rst +//! DeviceSpmv provides device-wide parallel operations for performing +//! sparse-matrix * dense-vector multiplication (SpMV). +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! The `SpMV computation `_ +//! performs the matrix-vector operation ``y = A * x + y``, where: +//! +//! - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in +//! `compressed-storage-row (CSR) format `_ +//! (i.e., three arrays: ``values``, ``row_offsets``, and ``column_indices``) +//! 
- ``x`` and ``y`` are dense vectors +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceSpmv} +//! +//! @endrst struct DeviceSpmv { - /******************************************************************//** - * @name CSR matrix operations - *********************************************************************/ - //@{ - - /** - * @brief This function performs the matrix-vector operation - * y = A*x. - * - * @par Snippet - * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A - * representing a 3x3 lattice (24 non-zeros). - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input - * vector x, - * // and output vector y - * int num_rows = 9; - * int num_cols = 9; - * int num_nonzeros = 24; - * - * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1] - * - * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, - * // 4, 6, 1, 3, 5, 7, 2, 4, - * // 8, 3, 7, 4, 6, 8, 5, 7] - * - * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] - * - * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] - * float* d_vector_y; // e.g., [ , , , , , , , , ] - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run SpMV - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros); - * - * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] - * - * @endcode - * - * @tparam ValueT - * [inferred] Matrix and vector value type (e.g., @p float, @p double, etc.) - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When NULL, the required allocation size is written to @p temp_storage_bytes - * and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_values - * Pointer to the array of @p num_nonzeros values of the corresponding nonzero elements - * of matrix A. - * - * @param[in] d_row_offsets - * Pointer to the array of @p m + 1 offsets demarcating the start of every row in - * @p d_column_indices and @p d_values (with the final entry being equal to @p num_nonzeros) - * - * @param[in] d_column_indices - * Pointer to the array of @p num_nonzeros column-indices of the corresponding nonzero - * elements of matrix A. (Indices are zero-valued.) - * - * @param[in] d_vector_x - * Pointer to the array of @p num_cols values corresponding to the dense input vector - * x - * - * @param[out] d_vector_y - * Pointer to the array of @p num_rows values corresponding to the dense output vector - * y - * - * @param[in] num_rows - * number of rows of matrix A. - * - * @param[in] num_cols - * number of columns of matrix A. 
- * - * @param[in] num_nonzeros - * number of nonzero elements of matrix A. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is stream0. - */ + //! @name CSR matrix operations + //! @{ + + //! @rst + //! This function performs the matrix-vector operation ``y = A*x``. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates SpMV upon a 9x9 CSR matrix ``A`` representing a 3x3 lattice (24 non-zeros). + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input matrix A, input + //! vector x, + //! // and output vector y + //! int num_rows = 9; + //! int num_cols = 9; + //! int num_nonzeros = 24; + //! + //! float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + //! // 1, 1, 1, 1, 1, 1, 1, 1, + //! // 1, 1, 1, 1, 1, 1, 1, 1] + //! + //! int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + //! // 4, 6, 1, 3, 5, 7, 2, 4, + //! // 8, 3, 7, 4, 6, 8, 5, 7] + //! + //! int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + //! + //! float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + //! float* d_vector_y; // e.g., [ , , , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + //! num_rows, num_cols, num_nonzeros); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run SpMV + //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + //! num_rows, num_cols, num_nonzeros); + //! + //! // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + //! + //! @endrst + //! + //! @tparam ValueT + //! 
**[inferred]** Matrix and vector value type (e.g., `float`, `double`, etc.) + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_values + //! Pointer to the array of `num_nonzeros` values of the corresponding nonzero elements + //! of matrix `A`. + //! + //! @param[in] d_row_offsets + //! Pointer to the array of `m + 1` offsets demarcating the start of every row in + //! `d_column_indices` and `d_values` (with the final entry being equal to `num_nonzeros`) + //! + //! @param[in] d_column_indices + //! Pointer to the array of `num_nonzeros` column-indices of the corresponding nonzero + //! elements of matrix `A`. (Indices are zero-valued.) + //! + //! @param[in] d_vector_x + //! Pointer to the array of `num_cols` values corresponding to the dense input vector `x` + //! + //! @param[out] d_vector_y + //! Pointer to the array of `num_rows` values corresponding to the dense output vector `y` + //! + //! @param[in] num_rows + //! number of rows of matrix `A`. + //! + //! @param[in] num_cols + //! number of columns of matrix `A`. + //! + //! @param[in] num_nonzeros + //! number of nonzero elements of matrix `A`. + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t CsrMV(void *d_temp_storage, size_t &temp_storage_bytes, @@ -240,7 +234,7 @@ struct DeviceSpmv stream); } - //@} end member group + //! 
@} end member group }; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 16bd05db81..5a76e73e5e 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -339,11 +339,11 @@ struct DeviceBatchMemcpyPolicy }; /** - * @tparam InputBufferIt [inferred] Random-access input iterator type providing the pointers + * @tparam InputBufferIt **[inferred]** Random-access input iterator type providing the pointers * to the source memory buffers - * @tparam OutputBufferIt [inferred] Random-access input iterator type providing the pointers + * @tparam OutputBufferIt **[inferred]** Random-access input iterator type providing the pointers * to the destination memory buffers - * @tparam BufferSizeIteratorT [inferred] Random-access input iterator type providing the + * @tparam BufferSizeIteratorT **[inferred]** Random-access input iterator type providing the * number of bytes to be copied for each pair of buffers * @tparam BufferOffsetT Integer type large enough to hold any offset in [0, num_buffers) * @tparam BlockOffsetT Integer type large enough to hold any offset in [0, diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 6678c2cf87..79e48a8ab3 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -139,7 +139,7 @@ DeviceHistogramInitKernel(ArrayWrapper num_output_bins * Number of channels actively being histogrammed * * @tparam SampleIteratorT - * The input iterator type. \iterator. + * The input iterator type. @iterator. 
* * @tparam CounterT * Integer type for counting sample occurrences per histogram bin @@ -565,7 +565,7 @@ struct dispatch_histogram * Number of channels actively being histogrammed * * @tparam SampleIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam CounterT * Integer type for counting sample occurrences per histogram bin diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index fe0554c932..07e99da614 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -515,10 +515,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE * Value type * * @tparam BeginOffsetIteratorT - * Random-access input iterator type for reading segment beginning offsets \iterator + * Random-access input iterator type for reading segment beginning offsets @iterator * * @tparam EndOffsetIteratorT - * Random-access input iterator type for reading segment ending offsets \iterator + * Random-access input iterator type for reading segment ending offsets @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -536,12 +536,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE * Output values buffer * * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + * Random-access input iterator to the sequence of beginning offsets of length `num_segments`, * such that d_begin_offsets[i] is the first element of the ith * data segment in d_keys_* and d_values_* * * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * Random-access input iterator to the sequence of ending offsets of length `num_segments`, * such that d_end_offsets[i]-1 is the last element of 
the ith * data segment in d_keys_* and d_values_*. * If d_end_offsets[i]-1 <= d_begin_offsets[i], @@ -1380,11 +1380,11 @@ struct DispatchRadixSort : SelectedPolicy //------------------------------------------------------------------------------ /// Device-accessible allocation of temporary storage. - // When NULL, the required allocation size is written to @p temp_storage_bytes and no work is + // When NULL, the required allocation size is written to `temp_storage_bytes` and no work is // done. void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is @@ -2343,10 +2343,10 @@ struct DispatchRadixSort : SelectedPolicy * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required - * allocation size is written to @p temp_storage_bytes and no work is done. + * allocation size is written to `temp_storage_bytes` and no work is done. 
* * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Double-buffer whose current buffer contains the unsorted input keys and, @@ -2467,10 +2467,10 @@ struct DispatchRadixSort : SelectedPolicy * Value type * * @tparam BeginOffsetIteratorT - * Random-access input iterator type for reading segment beginning offsets \iterator + * Random-access input iterator type for reading segment beginning offsets @iterator * * @tparam EndOffsetIteratorT - * Random-access input iterator type for reading segment ending offsets \iterator + * Random-access input iterator type for reading segment ending offsets @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -2497,10 +2497,10 @@ struct DispatchSegmentedRadixSort : SelectedPolicy //------------------------------------------------------------------------------ /// Device-accessible allocation of temporary storage. When NULL, the required allocation size - /// is written to @p temp_storage_bytes and no work is done. + /// is written to `temp_storage_bytes` and no work is done. 
void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is @@ -2517,12 +2517,12 @@ struct DispatchSegmentedRadixSort : SelectedPolicy /// The number of segments that comprise the sorting data OffsetT num_segments; - /// Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + /// Random-access input iterator to the sequence of beginning offsets of length `num_segments`, /// such that d_begin_offsets[i] is the first element of the ith /// data segment in d_keys_* and d_values_* BeginOffsetIteratorT d_begin_offsets; - /// Random-access input iterator to the sequence of ending offsets of length @p num_segments, + /// Random-access input iterator to the sequence of ending offsets of length `num_segments`, /// such that d_end_offsets[i]-1 is the last element of the ith /// data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 /// <= d_begin_offsets[i], the ith is considered empty. @@ -2862,10 +2862,10 @@ struct DispatchSegmentedRadixSort : SelectedPolicy * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required allocation size - * is written to @p temp_storage_bytes and no work is done. + * is written to `temp_storage_bytes` and no work is done. 
* * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Double-buffer whose current buffer contains the unsorted input keys and, upon return, is @@ -2883,11 +2883,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that d_begin_offsets[i] is the first element of the + * `num_segments`, such that d_begin_offsets[i] is the first element of the * ith data segment in d_keys_* and d_values_* * * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * Random-access input iterator to the sequence of ending offsets of length `num_segments`, * such that d_end_offsets[i]-1 is the last element of the ith * data segment in d_keys_* and d_values_*. * If d_end_offsets[i]-1 <= d_begin_offsets[i], diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 553d52b0a5..e429c2edbb 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -129,7 +129,7 @@ __host__ __device__ void finalize_and_store_aggregate(OutputIteratorT d_out, * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -205,10 +205,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy: * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the 
reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -311,18 +311,18 @@ NormalizeReductionOutput(KeyValuePairT &val, * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets - * \iterator + * @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets - * \iterator + * @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -520,10 +520,10 @@ struct DeviceReducePolicy * device-wide reduction * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -1016,10 +1016,10 @@ struct DispatchReduce : SelectedPolicy * device-wide transpose reduce * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -1058,18 +1058,18 @@ using DispatchTransformReduce = * device-wide reduction * * @tparam InputIteratorT - * 
Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets - * \iterator + * @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets - * \iterator + * @iterator * * @tparam OffsetT * Signed integer type for global offsets diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 8717792ee8..8ad76e591a 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -76,16 +76,16 @@ CUB_NAMESPACE_BEGIN * Parameterized AgentRlePolicyT tuning policy type * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OffsetsOutputIteratorT - * Random-access output iterator type for writing run-offset values \iterator + * Random-access output iterator type for writing run-offset values @iterator * * @tparam LengthsOutputIteratorT - * Random-access output iterator type for writing run-length values \iterator + * Random-access output iterator type for writing run-length values @iterator * * @tparam NumRunsOutputIteratorT - * Output iterator type for recording the number of runs encountered \iterator + * Output iterator type for recording the number of runs encountered @iterator * * @tparam ScanTileStateT * Tile status interface type @@ -164,16 +164,16 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * Utility class for dispatching the appropriately-tuned kernels for DeviceRle * * @tparam InputIteratorT - * Random-access input 
iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OffsetsOutputIteratorT - * Random-access output iterator type for writing run-offset values \iterator + * Random-access output iterator type for writing run-offset values @iterator * * @tparam LengthsOutputIteratorT - * Random-access output iterator type for writing run-length values \iterator + * Random-access output iterator type for writing run-length values @iterator * * @tparam NumRunsOutputIteratorT - * Output iterator type for recording the number of runs encountered \iterator + * Output iterator type for recording the number of runs encountered @iterator * * @tparam EqualityOpT * T equality operator type @@ -502,7 +502,7 @@ struct DeviceRleDispatch * Total number of input items (i.e., length of `d_in`) * * @param stream - * [optional] CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
*/ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 9f4e1bc29d..11a6324794 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -125,10 +125,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel(ScanTileStateT tile_st * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading scan inputs \iterator + * Random-access input iterator type for reading scan inputs @iterator * * @tparam OutputIteratorT - * Random-access output iterator type for writing scan outputs \iterator + * Random-access output iterator type for writing scan outputs @iterator * * @tparam ScanTileStateT * Tile status interface type @@ -214,10 +214,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) * DeviceScan * * @tparam InputIteratorT - * Random-access input iterator type for reading scan inputs \iterator + * Random-access input iterator type for reading scan inputs @iterator * * @tparam OutputIteratorT - * Random-access output iterator type for writing scan outputs \iterator + * Random-access output iterator type for writing scan outputs @iterator * * @tparam ScanOpT * Binary scan functor type having member diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index f4dfae06ef..1f87e99c79 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -96,12 +96,12 @@ CUB_NAMESPACE_BEGIN * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * i-th data segment in `d_keys_*` and `d_values_*` * * @param[in] 
d_end_offsets * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the + * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * i-th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. @@ -291,12 +291,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the + * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. @@ -420,12 +420,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the + * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. 
If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. @@ -1151,12 +1151,12 @@ struct DispatchSegmentedSort : SelectedPolicy /** * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work + * required allocation size is written to `temp_storage_bytes` and no work * is done. */ void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation std::size_t &temp_storage_bytes; /** @@ -1179,14 +1179,14 @@ struct DispatchSegmentedSort : SelectedPolicy /** * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` */ BeginOffsetIteratorT d_begin_offsets; /** * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that d_end_offsets[i]-1 is the last element + * `num_segments`, such that d_end_offsets[i]-1 is the last element * of the ith data segment in `d_keys_*` and * `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, * the ith is considered empty. diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index a891f5a273..b2538c27bf 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -612,7 +612,7 @@ struct DispatchSpmv * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation @@ -978,16 +978,16 @@ struct DispatchSpmv * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param SpMV spmv_params * input parameter bundle * * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is stream0. + * **[optional]** CUDA stream to launch kernels within. Default is stream0. */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 2bf2760299..2fd3a84ee0 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -211,10 +211,10 @@ struct DispatchUniqueByKey : SelectedPolicy using ScanTileStateT = ScanTileState; /// Device-accessible allocation of temporary storage. When NULL, the required allocation size - /// is written to @p temp_storage_bytes and no work is done. + /// is written to `temp_storage_bytes` and no work is done. void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Pointer to the input sequence of keys @@ -239,17 +239,17 @@ struct DispatchUniqueByKey : SelectedPolicy /// Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) OffsetT num_items; - /// [optional] CUDA stream to launch kernels within. Default is stream0. + /// **[optional]** CUDA stream to launch kernels within. 
Default is stream0. cudaStream_t stream; /** * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. * * @tparam temp_storage_bytes - * [in,out] Reference to size in bytes of @p d_temp_storage allocation + * [in,out] Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys @@ -274,7 +274,7 @@ struct DispatchUniqueByKey : SelectedPolicy * Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) * * @param[in] stream - * [optional] CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ CUB_RUNTIME_FUNCTION __forceinline__ DispatchUniqueByKey(void *d_temp_storage, @@ -538,10 +538,10 @@ struct DispatchUniqueByKey : SelectedPolicy * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. * * @param[in,out] &temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys @@ -566,7 +566,7 @@ struct DispatchUniqueByKey : SelectedPolicy * Total number of input items (i.e., the length of @p d_in) * * @param[in] stream - * [optional] CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
*/ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index 21b4895396..2249f39428 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -49,12 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * \addtogroup GridModule - * @{ - */ - - /** * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid */ @@ -221,8 +215,5 @@ public: } }; - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_even_share.cuh b/cub/cub/grid/grid_even_share.cuh index 30bba4bc28..fac48c20f0 100644 --- a/cub/cub/grid/grid_even_share.cuh +++ b/cub/cub/grid/grid_even_share.cuh @@ -52,12 +52,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup GridModule - * @{ - */ - - /** * @brief GridEvenShare is a descriptor utility for distributing input among * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly @@ -226,7 +220,4 @@ public: }; - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_mapping.cuh b/cub/cub/grid/grid_mapping.cuh index d9e019539c..3c1e36a9d1 100644 --- a/cub/cub/grid/grid_mapping.cuh +++ b/cub/cub/grid/grid_mapping.cuh @@ -46,13 +46,6 @@ CUB_NAMESPACE_BEGIN - -/** - * \addtogroup GridModule - * @{ - */ - - /****************************************************************************** * Mapping policies *****************************************************************************/ @@ -110,8 +103,5 @@ enum GridMappingStrategy GRID_MAPPING_DYNAMIC, }; - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_queue.cuh b/cub/cub/grid/grid_queue.cuh index 55cfecaa2f..6d62f42e03 100644 --- a/cub/cub/grid/grid_queue.cuh +++ b/cub/cub/grid/grid_queue.cuh @@ -49,13 +49,6 @@ CUB_NAMESPACE_BEGIN - -/** - * @addtogroup GridModule - * @{ - */ - - /** * @brief GridQueue is a descriptor utility for dynamic queue management. 
* @@ -242,12 +235,8 @@ __global__ void FillAndResetDrainKernel( } - #endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh index b561ccd488..f8061ce9b6 100644 --- a/cub/cub/iterator/arg_index_input_iterator.cuh +++ b/cub/cub/iterator/arg_index_input_iterator.cuh @@ -59,11 +59,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input wrapper for pairing dereferenced values with their corresponding * indices (forming \p KeyValuePair tuples). @@ -277,8 +272,4 @@ public: } }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh index 644cb95c7c..b04963c72c 100644 --- a/cub/cub/iterator/cache_modified_input_iterator.cuh +++ b/cub/cub/iterator/cache_modified_input_iterator.cuh @@ -62,13 +62,6 @@ CUB_NAMESPACE_BEGIN - - -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input wrapper for dereferencing array values using a PTX cache load * modifier. @@ -259,8 +252,4 @@ public: #endif }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index ee78b9a695..086ae7eb56 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -59,12 +59,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A random-access output wrapper for storing array values using a PTX cache-modifier. 
* @@ -269,7 +263,4 @@ public: } }; - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh index bf6dc1bfdc..3ed7b2b70a 100644 --- a/cub/cub/iterator/constant_input_iterator.cuh +++ b/cub/cub/iterator/constant_input_iterator.cuh @@ -58,13 +58,6 @@ CUB_NAMESPACE_BEGIN - -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A random-access input generator for dereferencing a sequence of homogeneous values * @@ -252,7 +245,4 @@ public: }; - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh index 384c44790c..f7295fce47 100644 --- a/cub/cub/iterator/counting_input_iterator.cuh +++ b/cub/cub/iterator/counting_input_iterator.cuh @@ -64,11 +64,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input generator for dereferencing a sequence of incrementing integer values. 
* @@ -248,8 +243,4 @@ public: }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/discard_output_iterator.cuh b/cub/cub/iterator/discard_output_iterator.cuh index c25f5a6e49..0b7ba3ef85 100644 --- a/cub/cub/iterator/discard_output_iterator.cuh +++ b/cub/cub/iterator/discard_output_iterator.cuh @@ -56,12 +56,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A discard iterator */ @@ -223,7 +217,4 @@ public: }; - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index f5324458d9..94fa277ba3 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -61,13 +61,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - - /** * @brief A random-access input wrapper for dereferencing array values through texture cache. * Uses newer Kepler-style texture objects. @@ -341,8 +334,4 @@ private: } }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/tex_ref_input_iterator.cuh b/cub/cub/iterator/tex_ref_input_iterator.cuh index 18cf46bb96..d1453f1d8c 100644 --- a/cub/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/cub/iterator/tex_ref_input_iterator.cuh @@ -49,11 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input wrapper for dereferencing array values through texture cache. 
* @@ -117,6 +112,4 @@ template < typename OffsetT = std::ptrdiff_t> using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator; -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh index 7ce36f1741..fb8266aecb 100644 --- a/cub/cub/iterator/transform_input_iterator.cuh +++ b/cub/cub/iterator/transform_input_iterator.cuh @@ -58,12 +58,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A random-access input wrapper for transforming dereferenced values. * @@ -265,8 +259,4 @@ public: } }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 4d5d0f7fd3..4642482450 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -50,11 +50,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIo - * @{ - */ - //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- @@ -416,7 +411,4 @@ ThreadLoad(InputIteratorT itr) #endif // DOXYGEN_SHOULD_SKIP_THIS -/** @} */ // end group UtilIo - - CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index c9d521190e..ad9b657f70 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -58,12 +58,6 @@ _CCCL_DIAG_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN - -/** - * @addtogroup UtilModule - * @{ - */ - /// @brief Inequality functor (wraps equality functor) template struct InequalityWrapper @@ -428,7 +422,4 @@ __device__ __host__ BinaryFlip MakeBinaryFlip(BinaryOpT binary_op) return BinaryFlip(binary_op); } -/** @} */ // end group UtilModule - - CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_scan.cuh b/cub/cub/thread/thread_scan.cuh index 271d14c78a..7a25033b74 
100644 --- a/cub/cub/thread/thread_scan.cuh +++ b/cub/cub/thread/thread_scan.cuh @@ -50,12 +50,6 @@ CUB_NAMESPACE_BEGIN /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) namespace internal { - -/** - * @addtogroup UtilModule - * @{ - */ - /** * @name Sequential prefix scan over statically-sized array types * @{ @@ -354,8 +348,5 @@ __device__ __forceinline__ T ThreadScanInclusive(T (&input)[LENGTH], //@} end member group -/** @} */ // end group UtilModule - - -} // internal namespace +} // internal namespace CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index fc8b3beb41..62580c7640 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -48,12 +48,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIo - * @{ - */ - - //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- @@ -414,8 +408,4 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) #endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @} */ // end group UtilIo - - CUB_NAMESPACE_END diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index 135e8f8cf2..ba92f6bd02 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -55,12 +55,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilMgmt - * @{ - */ - - /****************************************************************************** * CachingDeviceAllocator (host use) ******************************************************************************/ @@ -872,9 +866,4 @@ struct CachingDeviceAllocator }; - - - -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 44b6f57322..6f476fa807 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -25,9 +25,7 @@ * 
******************************************************************************/ -/*! \file - * \brief Detect the version of the C++ standard used by the compiler. - */ +//! @file Detect the version of the C++ standard used by the compiler. #pragma once @@ -43,6 +41,8 @@ #include +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // Deprecation warnings may be silenced by defining the following macros. These // may be combined. // - CUB_IGNORE_DEPRECATED_CPP_DIALECT: @@ -159,3 +159,5 @@ #undef CUB_COMP_DEPR_IMPL #undef CUB_COMP_DEPR_IMPL0 #undef CUB_COMP_DEPR_IMPL1 + +#endif // !DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index c1e0991e4b..693af8c2e3 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -94,12 +94,6 @@ #endif // DOXYGEN_SHOULD_SKIP_THIS -/** - * \addtogroup UtilMgmt - * @{ - */ - - // `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: #define CUB_DETAIL_DEBUG_LEVEL_NONE 0 @@ -332,6 +326,4 @@ inline __host__ __device__ void va_printf(char const*, Args const&...) 
#endif #endif -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 7c48e8a712..8b7dd6131e 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -65,12 +65,6 @@ CUB_NAMESPACE_BEGIN - -/** - * \addtogroup UtilMgmt - * @{ - */ - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail @@ -860,8 +854,4 @@ struct ChainedPolicy }; - - -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index 7956f655fd..cfadd98e13 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -49,11 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * \addtogroup UtilModule - * @{ - */ - #ifndef CUB_ALIGN #if defined(_WIN32) || defined(_WIN64) /// Align struct @@ -66,6 +61,7 @@ CUB_NAMESPACE_BEGIN #define CUB_PREVENT_MACRO_SUBSTITUTION +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, U &&u) @@ -81,6 +77,7 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, { return t < u ? 
::cuda::std::forward(u) : ::cuda::std::forward(t); } +#endif #ifndef CUB_MAX /// Select maximum(a, b) @@ -143,6 +140,4 @@ _CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) # endif #endif // !CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION -/** @} */ // end group UtilModule - CUB_NAMESPACE_END diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index b725b0646b..dd18583437 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -50,12 +50,6 @@ CUB_NAMESPACE_BEGIN -/** - * \addtogroup UtilPtx - * @{ - */ - - /****************************************************************************** * PTX helper macros ******************************************************************************/ @@ -515,8 +509,6 @@ __device__ __forceinline__ unsigned int LaneMaskGe() return ret; } -/** @} */ // end group UtilPtx - /** * @brief Shuffle-up for any data type. * Each warp-lanei obtains the value @p input contributed by @@ -524,8 +516,6 @@ __device__ __forceinline__ unsigned int LaneMaskGe() * For thread lanes @e i < src_offset, the thread's own @p input is returned to the thread. * ![](shfl_up_logo.png) * - * @ingroup WarpModule - * * @tparam LOGICAL_WARP_THREADS * The number of threads per "logical" warp. Must be a power-of-two <= 32. * @@ -606,8 +596,6 @@ ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask) * For thread lanes @e i >= WARP_THREADS, the thread's own @p input is returned to the * thread. ![](shfl_down_logo.png) * - * @ingroup WarpModule - * * @tparam LOGICAL_WARP_THREADS * The number of threads per "logical" warp. Must be a power-of-two <= 32. 
* @@ -696,8 +684,6 @@ ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask) * @tparam T * [inferred] The input/output element type * - * @ingroup WarpModule - * * @par * - Available only for SM3.0 or newer * diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh index 22020c3fbd..5750b3be6b 100644 --- a/cub/cub/util_temporary_storage.cuh +++ b/cub/cub/util_temporary_storage.cuh @@ -49,11 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilMgmt - * @{ - */ - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** @@ -120,6 +115,4 @@ AliasTemporaries(void *d_temp_storage, #endif // DOXYGEN_SHOULD_SKIP_THIS -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 5220647906..6bd60544d5 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -93,13 +93,6 @@ CUB_NAMESPACE_BEGIN #endif // !defined(__CUDACC_RTC__) #endif // !defined(CUB_IS_INT128_ENABLED) -/** - * \addtogroup UtilModule - * @{ - */ - - - /****************************************************************************** * Conditional types ******************************************************************************/ @@ -1380,7 +1373,4 @@ struct Traits : NumericTraits::type> {}; #endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @} */ // end group UtilModule - CUB_NAMESPACE_END diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh index e603ed8d36..805df97cdc 100644 --- a/cub/cub/warp/warp_exchange.cuh +++ b/cub/cub/warp/warp_exchange.cuh @@ -73,7 +73,6 @@ using InternalWarpExchangeImpl = /** * @brief The WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. - * @ingroup WarpModule * * @tparam T * The data type to be exchanged. 
diff --git a/cub/docs/benchmarking.rst b/cub/docs/benchmarking.rst index 590da8002b..a06abb1454 100644 --- a/cub/docs/benchmarking.rst +++ b/cub/docs/benchmarking.rst @@ -1,4 +1,4 @@ -Running CUB Benchmarks +CUB Benchmarks ************************************* This file contains instrutions on how to run all CUB benchmarks using CUB tuning infrastructure. diff --git a/cub/docs/index.rst b/cub/docs/index.rst index 90f82030ea..f91f4f6c58 100644 --- a/cub/docs/index.rst +++ b/cub/docs/index.rst @@ -9,6 +9,7 @@ CUB developer_overview test_overview tuning + benchmarking .. the line below can be used to use the README.md file as the index page .. .. mdinclude:: ../README.md diff --git a/cub/docs/repo.toml b/cub/docs/repo.toml index 7b33463955..23bc013609 100644 --- a/cub/docs/repo.toml +++ b/cub/docs/repo.toml @@ -74,7 +74,8 @@ doxygen_aliases = [ "warpstriped=Assumes a *warp-striped arrangement* of elements across threads, where warp\\ :sub:`i` owns the *i*\\ :sup:`th` range of (*warp-threads* * *items-per-thread*) contiguous items, and each thread owns items (*i*), (*i* + *warp-threads*), ..., (*i* + (*warp-threads* * (*items-per-thread* - 1))).", "linear_performance{1}=The work-complexity of \\1 as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU." , "plots_below=Performance plots for other scenarios can be found in the detailed method descriptions below.", - "identityzero=This operation assumes the value of obtained by the ``T``'s default constructor (or by zero-initialization if no user-defined default constructor exists) is suitable as the identity value \"zero\" for addition." 
+ "identityzero=This operation assumes the value obtained by the ``T``'s default constructor (or by zero-initialization if no user-defined default constructor exists) is suitable as the identity value \"zero\" for addition.", + "lookback=`decoupled look-back `_" ] # doxygen sometimes gets confused by macros. the array below allows the user to From c4769d7777336f7ce9fe38a12656b8401c5f8765 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 30 Nov 2023 15:10:26 -0600 Subject: [PATCH 2/4] CI log improvements (#621) * Add groups to script steps. * [skip-tests] missing quote * [skip-tests] Use function to only print group in GHA. * Fix color. * [skip-tests] Add group for env details. * [skip-tests] Add group to run_as_coder repro instructions. * Don't error on unbound. * Don't print script args * Color coder print message. * Avoid unbound errors with GITHUB_ACTIONS. * Don't run nvidia-smi manually in the test job. * sccache stats group. * Avoid sccache stats if sccache is not available. * [skip-tests] Inject intentional error. * Revert "[skip-tests] Inject intentional error." This reverts commit 7270a0cd0c5efd3e14f09bdb6355731e3bdf4a15. * Use preset name in group name. * Parameterize color. * Print sccache stats in group. * Add problem matcher. * Add problem matcher before moving repo files. * Remove the cmake regexs for now. * Try different problem matcher. * Just remove problem matchers for now. * Fix if * Remove redundant sccache stats. * Try adding problem matcher again. * Fix problem-matcher file name. * [skip-tests] Run smaller matrix for debug. * Fix path. * Use json array for matcher. * Fix json array. * [skip-tests] Disable verify devcontainers for now. * disable verify-devcontainers * Exclude home/coder from the path in the matcher. * Try a different regex. * Exclude leading slash. * Run as coder user. * Revert "Run as coder user." This reverts commit dace5f6963f58c4f545d0bf962aac2b987b6d75d. * Add ninja summary stats.
* Fix permissions of ninja summary script. * Make color conditional upon status. * Make sure to get correct build status. * Exit if build failed. * Fix if statement. * Print when build fails. * Disable exiting on non-zero return. * Don't use local, it resets exit code. * Fix variable name. * Emit error. * Make sccache stats part of group title. * Make repro instructions a conditional step. * Get rid of old code. * Go back to putting the repro instructions in the command step. * Don't output error::. * Update problem matcher. * Don't capture cmake output. * Fix group name. * Actually disable exiting on non-zero return. * Add echo -e. * Fix spacing. * Redundant "build". * Add space to fix emoji. * Move end message logic into end group. * Fix group name. * Don't print in GHA on success. * Fix emojis. * Refactor group command logic into function. * Docs. * Return status from run_command. * Revert test changes. * Update repro instructions. * Remove excess. * Use print_env_details directly to avoid duplicates. * Update problem-matcher.json * Add timing to build/test scripts. 
--- .github/problem-matchers/problem-matcher.json | 14 + .github/workflows/build-and-test-linux.yml | 1 - .github/workflows/run-as-coder.yml | 22 +- ci/build_common.sh | 198 +++++++-- ci/build_cub.sh | 4 + ci/build_libcudacxx.sh | 4 + ci/build_thrust.sh | 4 + ci/ninja_summary.py | 381 ++++++++++++++++++ ci/nvrtc_libcudacxx.sh | 2 + ci/sccache_stats.sh | 8 +- ci/test_cub.sh | 4 + ci/test_libcudacxx.sh | 4 + ci/test_thrust.sh | 4 + 13 files changed, 606 insertions(+), 44 deletions(-) create mode 100644 .github/problem-matchers/problem-matcher.json create mode 100755 ci/ninja_summary.py diff --git a/.github/problem-matchers/problem-matcher.json b/.github/problem-matchers/problem-matcher.json new file mode 100644 index 0000000000..f196a5c884 --- /dev/null +++ b/.github/problem-matchers/problem-matcher.json @@ -0,0 +1,14 @@ +{ + "problemMatcher": [ + { + "owner": "nvcc", + "pattern": [ + { + "regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$", + "severity": 4, + "message": 5 + } + ] + } + ] +} diff --git a/.github/workflows/build-and-test-linux.yml b/.github/workflows/build-and-test-linux.yml index 32cfc25951..6c5ba40061 100644 --- a/.github/workflows/build-and-test-linux.yml +++ b/.github/workflows/build-and-test-linux.yml @@ -44,5 +44,4 @@ jobs: runner: linux-${{inputs.cpu}}-gpu-v100-latest-1 image: ${{inputs.container_image}} command: | - nvidia-smi ${{ inputs.test_script }} diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml index 6d09fd220f..40bbf97b29 100644 --- a/.github/workflows/run-as-coder.yml +++ b/.github/workflows/run-as-coder.yml @@ -39,18 +39,30 @@ jobs: run: | cp -R cccl /home/coder/cccl chown -R coder:coder /home/coder/ + - name: Add NVCC problem matcher + run: | + echo "::add-matcher::cccl/.github/problem-matchers/problem-matcher.json" - name: Configure credentials and environment variables for sccache uses: ./cccl/.github/actions/configure_cccl_sccache - name: Run command shell: su coder {0} run: | - 
set -exo pipefail + set -eo pipefail cd ~/cccl + echo -e "\e[1;34mRunning as 'coder' user in $(pwd):\e[0m" + echo -e "\e[1;34m${{inputs.command}}\e[0m" eval "${{inputs.command}}" || exit_code=$? if [ ! -z "$exit_code" ]; then - echo "::error::Error! To checkout the corresponding code and reproduce locally, run the following commands:" - echo "git clone --branch $GITHUB_REF_NAME --single-branch --recurse-submodules https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" - echo "docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}" - exit $exit_code + echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m" + echo "::error:: To replicate this failure locally, follow the steps below:" + echo "1. Clone the repository, and navigate to the correct branch and commit:" + echo " git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" + echo "" + echo "2. Run the failed command inside the same Docker container used by the CI:" + echo " docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}" + echo "" + echo "For additional information, see:" + echo " - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md" + echo " - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md" fi diff --git a/ci/build_common.sh b/ci/build_common.sh index b398d5e582..7959acfb59 100755 --- a/ci/build_common.sh +++ b/ci/build_common.sh @@ -37,7 +37,6 @@ function usage { # Copy the args into a temporary array, since we will modify them and # the parent script may still need them. 
args=("$@") -echo "Args: ${args[@]}" while [ "${#args[@]}" -ne 0 ]; do case "${args[0]}" in -v | --verbose) VERBOSE=1; args=("${args[@]:1}");; @@ -90,7 +89,6 @@ export CTEST_PARALLEL_LEVEL="1" export CXX="${HOST_COMPILER}" export CUDACXX="${CUDA_COMPILER}" export CUDAHOSTCXX="${HOST_COMPILER}" - export CXX_STANDARD # Print "ARG=${ARG}" for all args. @@ -107,67 +105,193 @@ function print_var_values() { done } -echo "========================================" -echo "pwd=$(pwd)" -print_var_values \ - BUILD_DIR \ - CXX_STANDARD \ - CXX \ - CUDACXX \ - CUDAHOSTCXX \ - NVCC_VERSION \ - CMAKE_BUILD_PARALLEL_LEVEL \ - CTEST_PARALLEL_LEVEL \ - CCCL_BUILD_INFIX \ - GLOBAL_CMAKE_OPTIONS -echo "========================================" -echo -echo "========================================" -echo "Current commit is:" -git log -1 || echo "Not a repository" -echo "========================================" -echo +# begin_group: Start a named section of log output, possibly with color. +# Usage: begin_group "Group Name" [Color] +# Group Name: A string specifying the name of the group. +# Color (optional): ANSI color code to set text color. Default is blue (1;34). +function begin_group() { + # See options for colors here: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124 + local blue="34" + local name="${1:-}" + local color="${2:-$blue}" + + if [ -n "${GITHUB_ACTIONS:-}" ]; then + echo -e "::group::\e[${color}m${name}\e[0m" + else + echo -e "\e[${color}m================== ${name} ======================\e[0m" + fi +} + +# end_group: End a named section of log output and print status based on exit status. +# Usage: end_group "Group Name" [Exit Status] +# Group Name: A string specifying the name of the group. +# Exit Status (optional): The exit status of the command run within the group. Default is 0. 
+function end_group() { + local name="${1:-}" + local build_status="${2:-0}" + local duration="${3:-}" + local red="31" + local blue="34" + + if [ -n "${GITHUB_ACTIONS:-}" ]; then + echo "::endgroup::" + + if [ "$build_status" -ne 0 ]; then + echo -e "::error::\e[${red}m ${name} - Failed (⬆️ click above for full log ⬆️)\e[0m" + fi + else + if [ "$build_status" -ne 0 ]; then + echo -e "\e[${red}m================== End ${name} - Failed${duration:+ - Duration: ${duration}s} ==================\e[0m" + else + echo -e "\e[${blue}m================== End ${name} - Success${duration:+ - Duration: ${duration}s} ==================\n\e[0m" + fi + fi +} + +declare -A command_durations + +# Runs a command within a named group, handles the exit status, and prints appropriate messages based on the result. +# Usage: run_command "Group Name" command [arguments...] +function run_command() { + local group_name="${1:-}" + shift + local command=("$@") + local status + + begin_group "$group_name" + set +e + local start_time=$(date +%s) + "${command[@]}" + status=$? 
+ local end_time=$(date +%s) + set -e + local duration=$((end_time - start_time)) + end_group "$group_name" $status $duration + command_durations["$group_name"]=$duration + return $status +} + +function string_width() { + local str="$1" + echo "$str" | awk '{print length}' +} + +function print_time_summary() { + local max_length=0 + local group + + # Find the longest group name for formatting + for group in "${!command_durations[@]}"; do + local group_length=$(echo "$group" | awk '{print length}') + if [ "$group_length" -gt "$max_length" ]; then + max_length=$group_length + fi + done + + echo "Time Summary:" + for group in "${!command_durations[@]}"; do + printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}" + done + + # Clear the array of timing info + declare -gA command_durations=() +} + + +print_environment_details() { + begin_group "⚙️ Environment Details" + + echo "pwd=$(pwd)" + + print_var_values \ + BUILD_DIR \ + CXX_STANDARD \ + CXX \ + CUDACXX \ + CUDAHOSTCXX \ + NVCC_VERSION \ + CMAKE_BUILD_PARALLEL_LEVEL \ + CTEST_PARALLEL_LEVEL \ + CCCL_BUILD_INFIX \ + GLOBAL_CMAKE_OPTIONS + + echo "Current commit is:" + git log -1 || echo "Not a repository" + + if command -v nvidia-smi &> /dev/null; then + nvidia-smi + else + echo "nvidia-smi not found" + fi + + end_group "⚙️ Environment Details" +} + function configure_preset() { local BUILD_NAME=$1 local PRESET=$2 local CMAKE_OPTIONS=$3 + local GROUP_NAME="🛠️ CMake Configure ${BUILD_NAME}" pushd .. > /dev/null - - cmake --preset=$PRESET --log-level=VERBOSE $GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS - echo "$BUILD_NAME configure complete." - + run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE $GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS + status=$? 
popd > /dev/null + return $status } -function build_preset() -{ +function build_preset() { local BUILD_NAME=$1 local PRESET=$2 + local green="1;32" + local red="1;31" + local GROUP_NAME="🏗️ Build ${BUILD_NAME}" source "./sccache_stats.sh" "start" - pushd .. > /dev/null - - cmake --build --preset=$PRESET -v - echo "$BUILD_NAME build complete." + pushd .. > /dev/null + run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v + status=$? popd > /dev/null - source "./sccache_stats.sh" "end" + + minimal_sccache_stats=$(source "./sccache_stats.sh" "end") + + # Only print detailed stats in actions workflow + if [ -n "${GITHUB_ACTIONS:-}" ]; then + begin_group "💲 sccache stats" + echo "${minimal_sccache_stats}" + sccache -s + end_group + + begin_group "🥷 ninja build times" + echo "The "weighted" time is the elapsed time of each build step divided by the number + of tasks that were running in parallel. This makes it an excellent approximation + of how "important" a slow step was. A link that is entirely or mostly serialized + will have a weighted time that is the same or similar to its elapsed time. A + compile that runs in parallel with 999 other compiles will have a weighted time + that is tiny." + ./ninja_summary.py -C ${BUILD_DIR}/${PRESET} + end_group + else + echo $minimal_sccache_stats + fi + + return $status } function test_preset() { local BUILD_NAME=$1 local PRESET=$2 + local GROUP_NAME="🚀 Test ${BUILD_NAME}" pushd .. > /dev/null - - ctest --preset=$PRESET - echo "$BUILD_NAME testing complete." - + run_command "$GROUP_NAME" ctest --preset=$PRESET + status=$? 
popd > /dev/null + return $status } function configure_and_build_preset() diff --git a/ci/build_cub.sh b/ci/build_cub.sh index f31ec4fd2a..d587c2a6ad 100755 --- a/ci/build_cub.sh +++ b/ci/build_cub.sh @@ -2,6 +2,8 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + # CUB benchmarks require at least CUDA nvcc 11.5 for int128 # Returns "true" if the first version is greater than or equal to the second version_compare() { @@ -35,3 +37,5 @@ CMAKE_OPTIONS=" " configure_and_build_preset "CUB" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/build_libcudacxx.sh b/ci/build_libcudacxx.sh index 656851253a..1dc26f3228 100755 --- a/ci/build_libcudacxx.sh +++ b/ci/build_libcudacxx.sh @@ -2,7 +2,11 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="libcudacxx-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" configure_and_build_preset libcudacxx "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/build_thrust.sh b/ci/build_thrust.sh index 887f33b34b..6e4a82da0f 100755 --- a/ci/build_thrust.sh +++ b/ci/build_thrust.sh @@ -2,8 +2,12 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="thrust-cpp$CXX_STANDARD" CMAKE_OPTIONS="" configure_and_build_preset "Thrust" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/ninja_summary.py b/ci/ninja_summary.py new file mode 100755 index 0000000000..f496db534b --- /dev/null +++ b/ci/ninja_summary.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +# Copyright (c) 2018 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +r"""Summarize the last ninja build, invoked with ninja's -C syntax. + +This script is designed to be automatically run after each ninja build in +order to summarize the build's performance. Making build performance information +more visible should make it easier to notice anomalies and opportunities. 
To use +this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat. + +On Linux you can get autoninja to invoke this script using this syntax: + +$ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome + +You can also call this script directly using ninja's syntax to specify the +output directory of interest: + +> python3 post_build_ninja_summary.py -C out/Default + +Typical output looks like this: + +>ninja -C out\debug_component base +ninja.exe -C out\debug_component base -j 960 -l 48 -d keeprsp +ninja: Entering directory `out\debug_component' +[1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files +Longest build steps: + 0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time) + 0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time) + 0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time) + 1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time) +Time by build-step type: + 0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum) + 0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum) + 0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum) + 1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed +time sum) + 23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum) +26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism) +839 build steps completed, average of 32.17/s + +If no gn clean has been done then results will be for the last non-NULL +invocation of ninja. Ideas for future statistics, and implementations are +appreciated. + +The "weighted" time is the elapsed time of each build step divided by the number +of tasks that were running in parallel. This makes it an excellent approximation +of how "important" a slow step was. A link that is entirely or mostly serialized +will have a weighted time that is the same or similar to its elapsed time. 
A +compile that runs in parallel with 999 other compiles will have a weighted time +that is tiny.""" + +import argparse +import errno +import fnmatch +import os +import subprocess +import sys + +# The number of long build times to report: +long_count = 10 +# The number of long times by extension to report +long_ext_count = 10 + + +class Target: + """Represents a single line read for a .ninja_log file.""" + def __init__(self, start, end): + """Creates a target object by passing in the start/end times in seconds + as a float.""" + self.start = start + self.end = end + # A list of targets, appended to by the owner of this object. + self.targets = [] + self.weighted_duration = 0.0 + + def Duration(self): + """Returns the task duration in seconds as a float.""" + return self.end - self.start + + def SetWeightedDuration(self, weighted_duration): + """Sets the duration, in seconds, passed in as a float.""" + self.weighted_duration = weighted_duration + + def WeightedDuration(self): + """Returns the task's weighted duration in seconds as a float. + + Weighted_duration takes the elapsed time of the task and divides it + by how many other tasks were running at the same time. Thus, it + represents the approximate impact of this task on the total build time, + with serialized or serializing steps typically ending up with much + longer weighted durations. + weighted_duration should always be the same or shorter than duration. + """ + # Allow for modest floating-point errors + epsilon = 0.000002 + if (self.weighted_duration > self.Duration() + epsilon): + print('%s > %s?' % (self.weighted_duration, self.Duration())) + assert (self.weighted_duration <= self.Duration() + epsilon) + return self.weighted_duration + + def DescribeTargets(self): + """Returns a printable string that summarizes the targets.""" + # Some build steps generate dozens of outputs - handle them sanely. 
+ # The max_length was chosen so that it can fit most of the long + # single-target names, while minimizing word wrapping. + result = ', '.join(self.targets) + max_length = 65 + if len(result) > max_length: + result = result[:max_length] + '...' + return result + + +# Copied with some modifications from ninjatracing +def ReadTargets(log, show_all): + """Reads all targets from .ninja_log file |log_file|, sorted by duration. + + The result is a list of Target objects.""" + header = log.readline() + # Handle empty ninja_log gracefully by silently returning an empty list of + # targets. + if not header: + return [] + assert header == '# ninja log v5\n', \ + 'unrecognized ninja log version %r' % header + targets_dict = {} + last_end_seen = 0.0 + for line in log: + parts = line.strip().split('\t') + if len(parts) != 5: + # If ninja.exe is rudely halted then the .ninja_log file may be + # corrupt. Silently continue. + continue + start, end, _, name, cmdhash = parts # Ignore restat. + # Convert from integral milliseconds to float seconds. + start = int(start) / 1000.0 + end = int(end) / 1000.0 + if not show_all and end < last_end_seen: + # An earlier time stamp means that this step is the first in a new + # build, possibly an incremental build. Throw away the previous + # data so that this new build will be displayed independently. + # This has to be done by comparing end times because records are + # written to the .ninja_log file when commands complete, so end + # times are guaranteed to be in order, but start times are not. + targets_dict = {} + target = None + if cmdhash in targets_dict: + target = targets_dict[cmdhash] + if not show_all and (target.start != start or target.end != end): + # If several builds in a row just run one or two build steps + # then the end times may not go backwards so the last build may + # not be detected as such. 
However in many cases there will be a + # build step repeated in the two builds and the changed + # start/stop points for that command, identified by the hash, + # can be used to detect and reset the target dictionary. + targets_dict = {} + target = None + if not target: + targets_dict[cmdhash] = target = Target(start, end) + last_end_seen = end + target.targets.append(name) + return list(targets_dict.values()) + + +def GetExtension(target, extra_patterns): + """Return the file extension that best represents a target. + + For targets that generate multiple outputs it is important to return a + consistent 'canonical' extension. Ultimately the goal is to group build steps + by type.""" + for output in target.targets: + if extra_patterns: + for fn_pattern in extra_patterns.split(';'): + if fnmatch.fnmatch(output, '*' + fn_pattern + '*'): + return fn_pattern + # Not a true extension, but a good grouping. + if output.endswith('type_mappings'): + extension = 'type_mappings' + break + + # Capture two extensions if present. For example: file.javac.jar should + # be distinguished from file.interface.jar. + root, ext1 = os.path.splitext(output) + _, ext2 = os.path.splitext(root) + extension = ext2 + ext1 # Preserve the order in the file name. + + if len(extension) == 0: + extension = '(no extension found)' + + if ext1 in ['.pdb', '.dll', '.exe']: + extension = 'PEFile (linking)' + # Make sure that .dll and .exe are grouped together and that the + # .dll.lib files don't cause these to be listed as libraries + break + if ext1 in ['.so', '.TOC']: + extension = '.so (linking)' + # Attempt to identify linking, avoid identifying as '.TOC' + break + # Make sure .obj files don't get categorized as mojo files + if ext1 in ['.obj', '.o']: + break + # Jars are the canonical output of java targets. + if ext1 == '.jar': + break + # Normalize all mojo related outputs to 'mojo'. 
+ if output.count('.mojom') > 0: + extension = 'mojo' + break + return extension + + +def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting): + """Print a summary of the passed in list of Target objects.""" + + # Create a list that is in order by time stamp and has entries for the + # beginning and ending of each build step (one time stamp may have multiple + # entries due to multiple steps starting/stopping at exactly the same time). + # Iterate through this list, keeping track of which tasks are running at all + # times. At each time step calculate a running total for weighted time so + # that when each task ends its own weighted time can easily be calculated. + task_start_stop_times = [] + + earliest = -1 + latest = 0 + total_cpu_time = 0 + for target in entries: + if earliest < 0 or target.start < earliest: + earliest = target.start + if target.end > latest: + latest = target.end + total_cpu_time += target.Duration() + task_start_stop_times.append((target.start, 'start', target)) + task_start_stop_times.append((target.end, 'stop', target)) + length = latest - earliest + weighted_total = 0.0 + + # Sort by the time/type records and ignore |target| + task_start_stop_times.sort(key=lambda times: times[:2]) + # Now we have all task start/stop times sorted by when they happen. If a + # task starts and stops on the same time stamp then the start will come + # first because of the alphabet, which is important for making this work + # correctly. + # Track the tasks which are currently running. + running_tasks = {} + # Record the time we have processed up to so we know how to calculate time + # deltas. + last_time = task_start_stop_times[0][0] + # Track the accumulated weighted time so that it can efficiently be added + # to individual tasks. + last_weighted_time = 0.0 + # Scan all start/stop events. + for event in task_start_stop_times: + time, action_name, target = event + # Accumulate weighted time up to now. 
+ num_running = len(running_tasks) + if num_running > 0: + # Update the total weighted time up to this moment. + last_weighted_time += (time - last_time) / float(num_running) + if action_name == 'start': + # Record the total weighted task time when this task starts. + running_tasks[target] = last_weighted_time + if action_name == 'stop': + # Record the change in the total weighted task time while this task + # ran. + weighted_duration = last_weighted_time - running_tasks[target] + target.SetWeightedDuration(weighted_duration) + weighted_total += weighted_duration + del running_tasks[target] + last_time = time + assert (len(running_tasks) == 0) + + # Warn if the sum of weighted times is off by more than half a second. + if abs(length - weighted_total) > 500: + print('Warning: Possible corrupt ninja log, results may be ' + 'untrustworthy. Length = %.3f, weighted total = %.3f' % + (length, weighted_total)) + + # Print the slowest build steps: + print(' Longest build steps:') + if elapsed_time_sorting: + entries.sort(key=lambda x: x.Duration()) + else: + entries.sort(key=lambda x: x.WeightedDuration()) + for target in entries[-long_count:]: + print(' %8.1f weighted s to build %s (%.1f s elapsed time)' % + (target.WeightedDuration(), target.DescribeTargets(), + target.Duration())) + + # Sum up the time by file extension/type of the output file + count_by_ext = {} + time_by_ext = {} + weighted_time_by_ext = {} + # Scan through all of the targets to build up per-extension statistics. 
+ for target in entries: + extension = GetExtension(target, extra_step_types) + time_by_ext[extension] = time_by_ext.get(extension, + 0) + target.Duration() + weighted_time_by_ext[extension] = weighted_time_by_ext.get( + extension, 0) + target.WeightedDuration() + count_by_ext[extension] = count_by_ext.get(extension, 0) + 1 + + print(' Time by build-step type:') + # Copy to a list with extension name and total time swapped, to (time, ext) + if elapsed_time_sorting: + weighted_time_by_ext_sorted = sorted( + (y, x) for (x, y) in time_by_ext.items()) + else: + weighted_time_by_ext_sorted = sorted( + (y, x) for (x, y) in weighted_time_by_ext.items()) + # Print the slowest build target types: + for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]: + print( + ' %8.1f s weighted time to generate %d %s files ' + '(%1.1f s elapsed time sum)' % + (time, count_by_ext[extension], extension, time_by_ext[extension])) + + print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx ' + 'parallelism)' % + (length, total_cpu_time, total_cpu_time * 1.0 / length)) + print(' %d build steps completed, average of %1.2f/s' % + (len(entries), len(entries) / (length))) + + +def main(): + log_file = '.ninja_log' + metrics_file = 'siso_metrics.json' + parser = argparse.ArgumentParser() + parser.add_argument('-C', dest='build_directory', help='Build directory.') + parser.add_argument( + '-s', + '--step-types', + help='semicolon separated fnmatch patterns for build-step grouping') + parser.add_argument( + '-e', + '--elapsed_time_sorting', + default=False, + action='store_true', + help='Sort output by elapsed time instead of weighted time') + parser.add_argument('--log-file', + help="specific ninja log file to analyze.") + args, _extra_args = parser.parse_known_args() + if args.build_directory: + log_file = os.path.join(args.build_directory, log_file) + metrics_file = os.path.join(args.build_directory, metrics_file) + if args.log_file: + log_file = args.log_file + if not 
args.step_types: + # Offer a convenient way to add extra step types automatically, + # including when this script is run by autoninja. get() returns None if + # the variable isn't set. + args.step_types = os.environ.get('chromium_step_types') + if args.step_types: + # Make room for the extra build types. + global long_ext_count + long_ext_count += len(args.step_types.split(';')) + + if os.path.exists(metrics_file): + # Automatically handle summarizing siso builds. + cmd = ['siso.bat' if 'win32' in sys.platform else 'siso'] + cmd.extend(['metrics', 'summary']) + if args.build_directory: + cmd.extend(['-C', args.build_directory]) + if args.step_types: + cmd.extend(['--step_types', args.step_types]) + if args.elapsed_time_sorting: + cmd.append('--elapsed_time_sorting') + subprocess.run(cmd) + else: + try: + with open(log_file, 'r') as log: + entries = ReadTargets(log, False) + if entries: + SummarizeEntries(entries, args.step_types, + args.elapsed_time_sorting) + except IOError: + print('Log file %r not found, no build summary created.' % log_file) + return errno.ENOENT + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/ci/nvrtc_libcudacxx.sh b/ci/nvrtc_libcudacxx.sh index 4a0d9f6e89..a33fb14522 100755 --- a/ci/nvrtc_libcudacxx.sh +++ b/ci/nvrtc_libcudacxx.sh @@ -2,6 +2,8 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="libcudacxx-nvrtc-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh index 8abb4125c2..3a3ebc421c 100755 --- a/ci/sccache_stats.sh +++ b/ci/sccache_stats.sh @@ -1,7 +1,7 @@ #!/bin/bash # This script prints the sccache hit rate between two calls to sccache --show-stats. -# It should be sourced in your script before and after the operations you want to profile, +# It should be sourced in your script before and after the operations you want to profile, # with the 'start' or 'end' argument respectively. 
mode=$1 @@ -12,6 +12,12 @@ if [[ "$mode" != "start" && "$mode" != "end" ]]; then exit 1 fi +# Check if sccache is available +if ! command -v sccache &> /dev/null; then + echo "Notice: sccache is not available. Skipping..." + exit 0 +fi + case $mode in start) export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') diff --git a/ci/test_cub.sh b/ci/test_cub.sh index b379cc2cbf..9fd9feff48 100755 --- a/ci/test_cub.sh +++ b/ci/test_cub.sh @@ -2,8 +2,12 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + ./build_cub.sh "$@" PRESET="cub-cpp$CXX_STANDARD" test_preset CUB "${PRESET}" + +print_time_summary diff --git a/ci/test_libcudacxx.sh b/ci/test_libcudacxx.sh index c433199cc4..64736f430e 100755 --- a/ci/test_libcudacxx.sh +++ b/ci/test_libcudacxx.sh @@ -2,6 +2,8 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="libcudacxx-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" @@ -18,3 +20,5 @@ test_preset "libcudacxx (CTest)" ${CTEST_PRESET} source "./sccache_stats.sh" "start" test_preset "libcudacxx (lit)" ${LIT_PRESET} source "./sccache_stats.sh" "end" + +print_time_summary diff --git a/ci/test_thrust.sh b/ci/test_thrust.sh index a2895f9aea..1385ef560e 100755 --- a/ci/test_thrust.sh +++ b/ci/test_thrust.sh @@ -2,8 +2,12 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + ./build_thrust.sh "$@" PRESET="thrust-cpp$CXX_STANDARD" test_preset "Thrust" ${PRESET} + +print_time_summary From 79f8f712af88756934f93dc9a1daaeedb3c5f612 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Thu, 30 Nov 2023 14:45:40 -0800 Subject: [PATCH 3/4] Setup documentation and corresponding github action (#1118) * Make top level docs for CCCL. 
* Fix image paths * Add Thrust docs build and fix any issues * Add CUB docs build * Fix PTX docs interpretting liquid syntax * Fixup libcudacxx baseurl in docs * Fixup cub permission in gen_docs.bash * Fixup thrust docs script permissions * Fix favicon in libcudacxx * Document `$TAG`. * Document scripts better. --- .github/workflows/build-docs.yml | 83 ++++++++++++ cub/docs/gen_docs.bash | 30 +++++ cub/docs/gen_docs.sh | 20 --- cub/docs/repo.toml | 2 +- docs/Dockerfile | 18 +++ docs/build_docs.bash | 21 +++ docs/jekyll/_config.yaml | 30 +++++ docs/jekyll/_sass/color_schemes/nvidia.scss | 125 ++++++++++++++++++ docs/jekyll/favicon.ico | Bin 0 -> 25214 bytes docs/{ => jekyll}/images/codespaces.png | Bin .../jekyll}/images/nvidia_logo.png | Bin docs/{ => jekyll}/images/pr-checks.png | Bin .../images/repro_instructions.png | Bin docs/jekyll/index.md | 5 + docs/make_env.bash | 13 ++ docs/readme.md | 23 ++++ libcudacxx/docs/_config.yml | 4 +- libcudacxx/docs/extended_api/ptx.md | 2 +- libcudacxx/docs/images/nvidia_logo.png | Bin 0 -> 50546 bytes thrust/docs/build_docs_locally.bash | 13 ++ thrust/docs/doxybook/config.json | 2 +- thrust/docs/github_pages/_config.yml | 3 + .../_sass/color_schemes/nvidia.scss | 1 + 23 files changed, 371 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/build-docs.yml create mode 100755 cub/docs/gen_docs.bash delete mode 100755 cub/docs/gen_docs.sh create mode 100644 docs/Dockerfile create mode 100644 docs/build_docs.bash create mode 100644 docs/jekyll/_config.yaml create mode 100644 docs/jekyll/_sass/color_schemes/nvidia.scss create mode 100644 docs/jekyll/favicon.ico rename docs/{ => jekyll}/images/codespaces.png (100%) rename {libcudacxx/docs/assets => docs/jekyll}/images/nvidia_logo.png (100%) rename docs/{ => jekyll}/images/pr-checks.png (100%) rename docs/{ => jekyll}/images/repro_instructions.png (100%) create mode 100644 docs/jekyll/index.md create mode 100644 docs/make_env.bash create mode 100644 docs/readme.md 
create mode 100644 libcudacxx/docs/images/nvidia_logo.png create mode 100755 thrust/docs/build_docs_locally.bash diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 0000000000..b3eee11d28 --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,83 @@ +name: Deploy CCCL pages + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Pages + uses: actions/configure-pages@v3 + + # Build helper image for Thrust/CUB + - name: Build helper image + run: | + bash ./docs/make_env.bash "cccl:docs" + + # Build top level docs for CCCL + - name: Build landing page + uses: actions/jekyll-build-pages@v1 + with: + source: ./docs/jekyll + destination: ./_site + + # CUB + - name: Build CUB docs + run: | + bash ./docs/build_docs.bash "cccl:docs" /cccl/cub/docs/gen_docs.bash + sudo mkdir -p ./_site/cub + sudo cp -rf ./cub/docs/_build/docs/CUB/latest/* ./_site/cub + + # Libcudacxx + - name: Build libcudacxx docs + uses: actions/jekyll-build-pages@v1 + with: + source: ./libcudacxx/docs + destination: ./_site/libcudacxx + + # Thrust + - name: Build Thrust markdown in Docker + run: bash ./docs/build_docs.bash "cccl:docs" /cccl/thrust/docs/build_docs_locally.bash + + - name: Build Thrust docs + uses: actions/jekyll-build-pages@v1 + with: + source: ./thrust/build_docs/github_pages 
+ destination: ./_site/thrust + + # Upload build artifacts + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 diff --git a/cub/docs/gen_docs.bash b/cub/docs/gen_docs.bash new file mode 100755 index 0000000000..34ba008425 --- /dev/null +++ b/cub/docs/gen_docs.bash @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +## This script just wraps launching a docs build within a container +## Tag is passed on as the first argument ${1} + +set -e + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) + +cd $SCRIPT_PATH + +## Clean image directory, without this any artifacts will prevent fetching +rm -rf img +mkdir -p img + +if [ ! -n "$(find img -name '*.png')" ]; then + wget -q https://docs.nvidia.com/cuda/_static/Logo_and_CUDA.png -O img/logo.png + + # Parse files and collects unique names ending with .png + imgs=( $(grep -R -o -h '[[:alpha:][:digit:]_]*.png' ../cub | uniq) ) + imgs+=( "cub_overview.png" "nested_composition.png" "tile.png" "blocked.png" "striped.png" ) + + for img in "${imgs[@]}" + do + echo ${img} + wget -q https://nvlabs.github.io/cub/${img} -O img/${img} || echo "!!! Failed to fetch $img" + done +fi + +./repo.sh docs || echo "!!! There were errors while generating" diff --git a/cub/docs/gen_docs.sh b/cub/docs/gen_docs.sh deleted file mode 100755 index 34d28b881d..0000000000 --- a/cub/docs/gen_docs.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/env bash - - -mkdir -p img - -if [ ! 
-n "$(find img -name '*.png')" ]; then - wget -q https://docs.nvidia.com/cuda/_static/Logo_and_CUDA.png -O img/logo.png - - # Parse files and collects unique names ending with .png - imgs=$(grep -R -o -h '[[:alpha:][:digit:]_]*.png' ../cub) - imgs="${imgs}\ncub_overview.png\nnested_composition.png\ntile.png\nblocked.png\nstriped.png" - - for img in $(echo -e ${imgs} | sort | uniq) - do - echo ${img} - wget -q https://nvlabs.github.io/cub/${img} -O img/${img} - done -fi - -./repo.sh docs diff --git a/cub/docs/repo.toml b/cub/docs/repo.toml index 23bc013609..eed418d51e 100644 --- a/cub/docs/repo.toml +++ b/cub/docs/repo.toml @@ -102,7 +102,7 @@ doxygen_predefined = [ "CUB_IGNORE_DEPRECATED_CPP_DIALECT" ] -# make sure to use ./fetch_imgs.sh +# make sure to use ./fetch_imgs.sh doxygen_conf_extra = """ IMAGE_PATH = ${config_root}/img DOXYFILE_ENCODING = UTF-8 diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 0000000000..f628b1b7f8 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,18 @@ +FROM ubuntu:22.04 + +SHELL [ "/usr/bin/env", "/bin/bash", "-c" ] + +ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 + +RUN apt-get -y -qq update; \ + apt-get -y -qq upgrade; \ + apt-get -y -qq install doxygen unzip wget + +RUN mkdir -p /opt/doxybook2; \ + cd /opt/doxybook2; \ + wget -q -O doxybook2.zip "https://github.com/matusnovak/doxybook2/releases/download/v1.5.0/doxybook2-linux-amd64-v1.5.0.zip"; \ + unzip doxybook2.zip + +ENV PATH "$PATH:/opt/doxybook2/bin" + +SHELL [ "/bin/bash" ] diff --git a/docs/build_docs.bash b/docs/build_docs.bash new file mode 100644 index 0000000000..569f70c484 --- /dev/null +++ b/docs/build_docs.bash @@ -0,0 +1,21 @@ +#!/usr/bin/env sh + +## This script just wraps launching a docs build within a container +## Tag is passed on as the first argument ${1} + +set -ex + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) +cd $SCRIPT_PATH + +CCCL_ROOT=$(realpath $SCRIPT_PATH/..) 
+ +TAG=${1} +shift + +( + docker run --rm \ + --mount type=bind,src=${CCCL_ROOT},dst=/cccl \ + $TAG \ + bash -c "$@" +) diff --git a/docs/jekyll/_config.yaml b/docs/jekyll/_config.yaml new file mode 100644 index 0000000000..bc149cf4af --- /dev/null +++ b/docs/jekyll/_config.yaml @@ -0,0 +1,30 @@ +title: CUDA C++ Core Libraries + +repository: nvidia/cccl + +remote_theme: pmarsceill/just-the-docs + +color_scheme: nvidia +logo: /images/nvidia_logo.png + +search_enabled: true +search.heading_level: 4 + +# just-the-docs ignores these filenames by default. +include: [ "contributing.md", "code_of_conduct.md" ] + +plugins_dir: + - jekyll-remote-theme + - jekyll-optional-front-matter # GitHub Pages. + - jekyll-default-layout # GitHub Pages. + - jekyll-titles-from-headings # GitHub Pages. + - jekyll-relative-links # GitHub Pages. + +defaults: + - + scope: + path: index.md + values: + title: index + nav_order: 0 + permalink: / diff --git a/docs/jekyll/_sass/color_schemes/nvidia.scss b/docs/jekyll/_sass/color_schemes/nvidia.scss new file mode 100644 index 0000000000..6bd1ddcbbf --- /dev/null +++ b/docs/jekyll/_sass/color_schemes/nvidia.scss @@ -0,0 +1,125 @@ +$body-line-height: 1.4; +$content-line-height: 1.4; +.highlight { line-height: 1.0 !important; } + +/* h1 size. We make this smaller so the README title fits on one line. */ +$font-size-9: 30px; + +/* Inline code. */ +code, +code.highlighter-rouge +{ font-size: 0.85em !important; } + +/* Code blocks. 
*/ +pre.highlight code +{ font-size: 0.9em !important; } + +$nav-width: 300px; +$content-width: 1000px; + +$body-background-color: $grey-dk-300; +$sidebar-color: $grey-dk-300; +$border-color: $grey-dk-200; + +$body-text-color: $grey-lt-300; +$body-heading-color: $grey-lt-000; +$nav-child-link-color: $grey-dk-000; +$search-result-preview-color: $grey-dk-000; + +$link-color: #76b900; +$btn-primary-color: #76b900; +$base-button-color: $grey-dk-250; + +$code-background-color: $grey-dk-250; +$search-background-color: $grey-dk-250; +$table-background-color: $grey-dk-250; +$feedback-color: darken($sidebar-color, 3%); + +div.highlighter-rouge, +pre.highlight code +{ background-color: #111 !important; } + +.highlight span.err { color: #ff0000; font-weight: bold; } /* Error */ + +.highlight span.ow, /* Operator.Word */ +.highlight span.k, /* Keyword */ +.highlight span.kc, /* Keyword.Constant */ +.highlight span.kd, /* Keyword.Declaration */ +.highlight span.kp, /* Keyword.Pseudo */ +.highlight span.kr, /* Keyword.Reserved */ +.highlight span.bp, /* Name.Builtin.Pseudo */ +.highlight span.vc, /* Name.Variable.Class */ +.highlight span.vg, /* Name.Variable.Global */ +.highlight span.vi /* Name.Variable.Instance */ +{ color: #76b900; font-weight: bold; } + +.highlight span.n, /* Name */ +.highlight span.h, /* Name */ +.highlight span.na, /* Name.Attribute */ +.highlight span.nb, /* Name.Builtin */ +.highlight span.nc, /* Name.Class */ +.highlight span.no, /* Name.Constant */ +.highlight span.nd, /* Name.Decorator */ +.highlight span.ni, /* Name.Entity */ +.highlight span.ne, /* Name.Exception */ +.highlight span.nf, /* Name.Function */ +.highlight span.nl, /* Name.Label */ +.highlight span.nn, /* Name.Namespace */ +.highlight span.nx, /* Name.Other */ +.highlight span.py, /* Name.Property */ +.highlight span.nt, /* Name.Tag */ +.highlight span.nv, /* Name.Variable */ +.highlight span.kt /* Keyword.Type */ +{ color: $grey-lt-300 } + +.highlight span.c, /* Comment */ +.highlight 
span.cm, /* Comment.Multiline */ +.highlight span.c1, /* Comment.Single */ +.highlight span.cs /* Comment.Special */ +{ color: #009966; font-style: italic } + +.highlight span.cp /* Preprocessor */ +.highlight span.kn, /* Keyword.Namespace */ +{ color: $grey-dk-000 } + +.highlight span.o, /* Operator */ +.highlight span.p /* Punctuation */ +{ color: #00ff00 } + +.highlight span.ge { font-style: italic } /* Generic.Emph */ + +.highlight span.gs { font-weight: bold } /* Generic.Strong */ + +.highlight span.l, /* Literal */ +.highlight span.ld, /* Literal.Date */ +.highlight span.m, /* Literal.Number */ +.highlight span.mf, /* Literal.Number.Float */ +.highlight span.mh, /* Literal.Number.Hex */ +.highlight span.mi, /* Literal.Number.Integer */ +.highlight span.mo, /* Literal.Number.Oct */ +.highlight span.il, /* Literal.Number.Integer.Long */ +.highlight span.s, /* Literal.String */ +.highlight span.sb, /* Literal.String.Backtick */ +.highlight span.sc, /* Literal.String.Char */ +.highlight span.sd, /* Literal.String.Doc */ +.highlight span.s2, /* Literal.String.Double */ +.highlight span.se, /* Literal.String.Escape */ +.highlight span.sh, /* Literal.String.Heredoc */ +.highlight span.si, /* Literal.String.Interpol */ +.highlight span.sx, /* Literal.String.Other */ +.highlight span.sr, /* Literal.String.Regex */ +.highlight span.s1, /* Literal.String.Single */ +.highlight span.ss /* Literal.String.Symbol */ +{ color: #119911 } + +.highlight span.w { color: #00cc00 } /* Text.Whitespace */ + +.highlight span.gh, /* Generic.Heading */ +.highlight span.gp, /* Generic.Prompt */ +.highlight span.gu /* Generic.Subheading */ +{ color: #00ff00; font-weight: bold } + +.highlight span.gd { color: #ff0000 } /* Generic.Deleted */ +.highlight span.gi { color: #00ff00 } /* Generic.Inserted */ + +.search-input { color: $body-text-color; } diff --git a/docs/jekyll/favicon.ico b/docs/jekyll/favicon.ico new file mode 100644 index 
0000000000000000000000000000000000000000..424df87200c706460f9ad1c7722ef0d35f286f2b GIT binary patch literal 25214 zcmeHP33MFAnf_ZXd&ZV!naGC>!X8`L#s@^339*=AG3F%%2(Td`0YYHKZfwp41JlG< zfZ7Zop2Zoiut~6CAT|+s5lq4~av)>Gz1)X6mVj6i!b{kVI1-e^So?idUDG|%Sh8iv zVs^^ullRHy1IHiB9@Gi#>NTAE9Jz|BG-vXET-#kRfuc`ZNday-`x^9UJMedqPeqBFtl*n;24S!75%@TQVj^Rfge5W01_+xTnrO4#tM0SS{ zFd@wV{bF);rpm=1E*e2`U7g_*MZlGOK97ce^(X!PECKjO5j<(oQS!wW=KN zXcRk>iRV~@NJl2ik`1y{wYPwv8nW?36ASS;$ZJr@#xtx(JK|)pVKb7UKhl*{2JH^| zPxfRo63@6IiG*q=KiTb!Tb0S8A(Bm?Rq?0eosw-5I}y)R7LDp|y3Y&=eWNh&1^J3+QH_*CNnxv^}dM=*(!ok?QU7|(WcVWzhy*e5+~ zDzYj9Rq;$5)5Qfgn}vOmvyxmF(pdT8^cUS-gDO`f|2`y$M@KTAp?;OBIL2uDvCqeHz{h?d2R-&fIrOm`s=BfpP>!9sw+|BNr}9 zemoIaFni0Xl*Q_biVCafFR0a(5ZG4!n(FF^wS_cgp#FwhY-x3h)l~XkZONb65jt-8 znKz85NNN7L^rAW@^%E!6#R8%EH6hhQ9{RkpL$f3jLi;3UpRjb6HY!t)H&Z5In9YU3uyUYgbm-SgWYYf9T*K z&92a0R`G;wU7HV=tlE0o!5<|jqc9UWoW-a1r$)%3T?aq@0)83-+dNP;zb?!L*%wf* zs1Nv457`@LErsOsQymUa;CG|AJSqVnW;hF7fhnZYQ$iDFuxkBh#5y64(&@U zhz(0dDRhp*gx;0LI9}saiE3!5C_ZG-iH(sOP6DJ^ zSP-iXIT0!rf^E2woB3HneYn2`2{k``&xi3}$!shC+*rfR1`R(m)tR{Z%4r9SfPT#6 za~9J`au-X+5a&GQbE_faT6}KhKc`lPCJ`4Hv&irn_3rP=Xg;HM;Td&(i;OqVsC+)X zr&T?lPQCX6nU=p$W^QPcIe0$Zy?d3+@4nQ3M!iBl_rR617oJm_9=clgdH7n{56`KK z@tk_VfBd_8PCfXsAIV{l-zZ;y@+Lf|X5=V5rylprTKQ)@qaF_|#WU(jct$-H�&2 z8TE`8?v&JvS^4fuzmVlX`584de&u300^1n@wSa7=j!%$y!{Dt+Uu)y zW!~fvSD`&>jhZ@lo~h1=`igwQLOKtJTYW<^VJW5W; z*H5!7ykLiBofNOCt9JBazC{g&L1czkY4xfZM3vYg zqpTUsxQm-L%K8j4coAyHre4LK{Zj>BIjWRnn#%Cc)%(C%wl5rN2g_uAkDC@6y+B?m z8G8IxC<4v8q~r=8QwF6%(+*qU?yO|`KY8!q?^6onje#1@3hh&>u}7JJbMz_q zu9P}pdj2Ab;rx8J`*42l?@bomf2kxMzDD+awU;qun#!|`;en?Mt=OWx5>%R z2YZe)vFC7bCf@RLZigMyvg73le3~MF^=X9NsWc0^lAoYXj?~=+^;+E0rtZX>S$!oI z7*wmLPS{0iNkP3*aI|Puk3s7sv{%>4Xuct>tCBi(PO|e(vwU52H)H1FCdrNJfiZ}%iUA<-;Y2pRe^c)67Z&vUgVpA%^xF19)HiN!Q40k zzTJTQDd0VnV@2bcOI3ls8*mWX&O&|*@FL28L9XXksku^$#(6oIzU}^d&MfG;7WV%L z4Afcde^F;bm0W6P$Q}WFAG%&ezNtvpU_bRWHJHBj{^wFPuyq;mCm=j0IA3lBP6M*Q zBfvwzdf;ZD1vn6x80tY#%%v*Pwij?Q!1?zf5Y|6bdF%Z5oF?deQN<{T`ID$03A%p( 
z373Xmf8GUu4mbvU=B0bk50-MN(P&?c{8r?D0tSxn!1Zi+|8uFMq33<*G_rq2IRzS? zi$ckBYC-ovzXSLwkVpNu$Oq1I#*^#pa)9TWrIY7ONBb)9_97qI{K=)3K>lxl8N;_w z_P_KCv$>QF91YM{_A6H8OfEGFwE4ifz^j1KWAfjCb}UeM@GFvMC6}5CTnqdaFmfi} za{N8#o9MUpU614MQ0MD@omlpqN0m+;AI{swpnVNE2G}2U9zg5FdG0yi23`)ynEBBK z`V7NbGWVR%L+)wtg~y__|DLlK_}){zdfssUgyVhy+Vr^U_J>iYznm{xzwYzw;l<^T0cC)+jFBiG*r z%B4Po@~=bfohb7ihEDW9kTde({AslNUz~x6UXyPtAw!!J0Sn;0?{Ofk$CQix_nZXy z{))buIl%cJp7Y;RZPevCe*lWV;Zi#O;qj(kt^s?SE~Curq7BgNelRySg2wphds{%) zzG)u<|2?My{P%%B7*Ef+Ib^2+?cq7QywGL}gZ+OAGP?l0uLXK@DfTJo1LwTwoK^6( zm^UmpupIcGOC6wWP*#I`uRV=xC|?q4=X};*nJO@#P&|J)b_c0uz0O{ax{k5d{|D5A zxkScXY8kLCd%nPbeQz|^HOGjt3%>b~OI?Y&?!S)D#YOGfKJQDECEu0G_wa}3oD6y& zd^B<7et}<~3zFxYA8HT#Q>q;H|03wteGhy%qo7#NQ#WLvK6Jqr*IHZg>G|_})XxKg zz8fXnhk6e0bH@KCQ4ZE5@_EkJK^tn$m!ALnT3-rTy;sm<%sFs^F|YDm>K{Q1uLtDW z2zUVZX~q6fNbBbM7uNke>bnfk{YB8;1gTWKNCmNk*tcIRH1%oJXFXlNJK=)tg z_n^L8m<@HV&(o0~2k^f0ugDJtjsjfN>8I|Wo-^#jHK6mbJV>6i6!Z^3*S?yZaT&Ti zIR1fFj{70ND@q6T=)HeCa-6r9NX6^Vz_I6?TU;c?xt2?<0S5AJZGC9}X-q!@o`am% zr5RJ+2i5{#1APke@b#;C^q6s-{~2&7_%x5!$$JgQN#Dx@y&rA-!@=VH+;h$WUV|^; zIr#x-F93IeemZilk$h%Yh#|W{uCT3QQ*Eam*V=;3|xTn8Ge~#!1^u7xsM6ovxd7S zhi?zA>+SP@V3&rzeoMt~duboN|K?f@Vq~uQ@HI1ZxzsgLT3^}!@EppexW`=mcf{FY z^W}0m0_6ylBT$Y&IRZOc1oU@7NAmx5C^^Lc*Wp*V@Z{3p+obdSB7DOXbrJP@FPkk_ z{SM5gh}gZL_tiDYCgi=1wzMMeX_h>HXC|L_8~Hmk`IT-RgK z+sZt1(YckMrs(2RF~k z^!p~h3k#n4M&`%x_W6Fvb9jc=3A_oMRPfnz_+5l~z&8NC@3;^6Ghoh}hRe5X-*Y0+ zb2h+tGStKV-VIy=@SMFBSOV+~j0yQffBy+s1N=8Ilnc!S?k#SjL%tKLhCVRnM7=@@2rOz;s{^a5!)SpwD>q z`Bx70dCFJwq2J|v8T7}2a6F3bYrVhqd7ehjGhZ9v*z(MlXN*??mjfJwgMm>%4X_g6 zyH)D%1I`74^KBgu>T&?{J}$7Y$NLeqQJ(Q^0Qe4SH9#6;F&h{d{}(|!P<2(eKZrW{ z37$^}XX2i-6Y7kiIbRRkZw1}RE%Ei802;^bEkOTYtHJgu!*?4O0z5+<4O|D9@$Nx+ zFW?k_dW;R`+P>$+P-bjNdk&x;#?;LHrDiBW3-|vQsNV)qj`rBbm=P-geIBa)Gyasa ze?52&{a^gPFZF5cTR4WsKIhyFfVR?rvDFsxmDg`6yT+F3KgY??T7COsY@O|q;0Ch0N`d2>m+~hg3(LFFX_PLI&1?VgLai;GB$MP)T_dfmC$a(z* z-&66Nsi<=tsGEHHdj*{PdhQP2e;U($I|;d2*Vmw|cWa)*b>wK^hrk)Y=Yd0ks{wQE 
zKY_CT7QSAWZ%2J7+w?b>|0bAvOz%V+Y%%2*-uPTP~L)@c%Ia`n+MlXOCF@J~odNt52G(-dMfW<3_B!SQ}i~Z^MN#94j=7 z6>T18MeRzfa&**MDHE)o<{0>0YjYGjz-;skuu!P-oPsyH7+}sR_AA~wqU95sS`Hg8 z?MF2Z3-|ES->>aF{VQ0@J^+TkSKoHZqCx~NI)eM4Zvz_uzK>lD>;>!%aDT8KFnesh z7905+mvFE1bD#^@1NaVbm0y1ic?&QW*crGGco*mw%3u5q+RXyIFYgIF4^Z}gU={EM zfX^SVs4^Y)oCAOlmArmldmr^_z_Y-&fI|TK$bA~^?+(x|W5PX|zMs>+-d9lnbl{J` zfdJ+CosuZPd*gose9pKApk58i-<4v2Hv-hpdpG&7^UEh7?*YCFd=~f}P%O{>+_aQ- z&IQDBk&ebENAKi<+T=(AD^^8 nWL_HB_S$BWKTk5o3U?0!d0QgQe;a?B}=kzlZr~3 zk!;zC8IdLXexK>{{e16!_kZ`l_npUeJ?b(uXI`(fJkL4LbKcj#r;PPjS$4A^gjn^D zA2mUUJqw{NUQ7(|o8SW8CHRBc^SA{LA+c?=e=&DBw+#vyqQOIB2{I6&H^_%~y;%`j-R}=rg z1&T2(Lxi+-qhkCk2J6DE$Fv{#6GBfNq@k|g;`9#3k=BCn#o=!qbG<)0+~zks8b-{m z-~S1~#K(t{V1Vp;p7|3gV|k@kZB*sQ$I5@(Fqty(L4mn{ytVNX7f8)`d0=GK&9rt( z^z=7B`mq06g7)VU6YXoC?|fa{89;A#c>cg@XvxfiCpz=>&pfeo0NzD>{crO)*-!rH z+kqgMYya;L(eGa}2C-d`m#~Zp7P`Uw!*OKZ&insJAv7E~CHz`t)7!;+#qDyXoWTEn z8lizqTI%4wckN9CuGUfhQ1AHGKVK!#UR67o>g{fF$WMN2>x~YfKVH^er@hSQ#B#{8 z_4mNdOHrV9Ta$`rmPIzL{`(bRuldjd`+rAN_gwo;WzXo&C;N zEF^nI?ARajPzOyfF@A#;p4`4CZvEj)XgWj$j|K{{&rX%;-m(!HQds}=r_5;{+RL7A zb3N6e>e%zA%xQVrOB_d^5TADltq)3kb2+JO$^FMW2ukpV%=7iQ0gn}#s2E<6hbMGa zxW7;+H_JDHKIA-a?`is~YC#mod z9z+O|V-`T=RW3gQ_(EIg&$?e{ZF)78_U24G1`WiR^>6Yvdi|U|vj6eRs=?@3UaMBR zAo<=Hqb&~@F@=e)OC@!S@swDr>s{~05mAF=F;HCa{K77T%xOiW-$5g+<~2UE@T+50 zydoDkvcaV`$ub2zW*6rDlwRH_F2Bpu@zRH{v8eN8?u7rx4b~(?5UAP_WqrLPxuw!v z04=4qb9rv<6K6nnv0&!%3JJ}{IirE;x8V9j?&_d;N6x1tkMn+SRL`|47+C}e(fO!Y z&|TIrdub^PvgkAF_B+hmd%5A8*%9NRIENyZo?DCmV$jl6nl$EcF`{FE{K_laBu|I^ z2zd(3y53l39pCjN)$X;@w!xa(l;u9U)S(Ak(1xssGTs4lr zv3+?jLO5CkJbOz)8LfV&MIOeqU=V@tqibxJ!np8biOV9gnpClB%DF>IJSHE;JLOrC z@|;gtHTzfVReEjR1~0c4bbe%G%58)=I5}xWyeBLB$psJ-z| z-$_YugYrbh(EdlEd-@Z`vP;zWGw)<8mL>M3?bqS1)?-0sRSSOW>}b`oe2WeEV^11f zb1BIpcrsR;AoW}ST2%EA^^;0DT2&Hhn_X3R_v-mdZOtS#y(&O@J)BPlL2E;vi zTQSIgJj?(#rM`|Vtd~wa+Ksx)(;ebsmc4JN@Y6v|#@$W|Q%^P#*%Mya5%a!P!Et2Q zmm7X(GtFd-#-e=~5dTns#TkvdfX%Hz2%Tcgvh!Vd1Z_`n_I6b??VfD;s7ZQ~(~BL0 zCs 
z!b6Ub*1@RJx@2b?Gdh%Oa(-i`x|*kU|J1(=7-Z*APvkG(X!;2L2qz_GFsZ&Qu3p1k zo}1d#>a|EB%iYVy#Kia)DcTw7(s4LLY@TdQNPJi|B{Y?tmiqXE$NN{xN;xs*&m&dd ztMN5tl?Zyq@W(gx|D+!Xx_PLkoJvDWnQ|`%@y@%gR0z#f$N?)r{OQFxUUS5zfcw!)-?-|J;CPuBl!G*Mo zU5j>T`RVf%wA;bg<>SxZF)z=bWX&sy`=+m?%C3jDa=SEd`c;D8k z?b#EpS|4u=wNB2|o16rSxoWmb;nz}^l$WFVy@pM*EYblc5i<9JQ0Cnb0tW~;O)$A+ zI-tGKgHVh{K4LqGdD8MuGjxG9isM?v5l#S*!vWdFP<6<6wxsA_}*&2GvcbSmf0bxAijgcEF z^k&+(C?mZY)AV!o;KtBFr{y)LSuKRfUshklIXC;y!mK8#op%S6^qc~J!qcWkg zu)%+L-aw+^vY<@@s6NK2w*@nhm2luW9k-W@1R@gQnwbvulq2tFn9?ZYDlfgKc|4Ac zH8f}DQ8PMbiX?G^_a~5#kPA2S|kk&nVo^FgYo}SR)(3rH9aTLI1P!IAzra(QF``%-T zWRo3q$_C)j7ojSc001HN`x$7*8b}l$;zdM%nhNdfDZL!OaxRfLXM!tsJ+G-sAI4Yc zp4XgxBmzMqHT8lmLr^9tej@qEwX>7@%-@uZ&X|GA!O7GUOo9teA6)f9h@R^(BY}ZA zLrGTe?f91v-t!{2`YJ|dKh*++6L$7n7)qR+Q4XH8_OUGxj_f-a&GPhq#gpIn>|{rT zjL!J%OHUK-=`d$sCzFkeg=cfTwdkxa^+2`m(UG3WGvEvR!s;BkUU z(mKIH1v$s8v!$Fe&=2*)Al_(WDms;AtC}Is{j{$@fK+@(TV3yM$k;7SDZN!d%_HaV zV0I0OJ=WyvRQLB;8i7#{#5lepPJe{NK`(g zKUF0zmRWRYCqh@rP=QH$RlEzX&}GKVrHuKr+1?#$5m?Vx3+&FPyzjuB$dZDdi#!nY zH10rZOG~LkI-lgQU&4YToMXY}NK*8p9q8+&<oKqJdSU;@OqfI~T9dra*EXiViMI!Zs^N9}i3&zI30=8YwyN?NS8p$7TAy*Q(4GA8{sq;a=Yn zzjX`P|NQMCuj!sDYUN8V{Q4*2Am04P8wDdxJv2aGDf^UEM$8^X^)|V%)eIgpGyUBnYRo8b!VXr zLEJx?KYc{R+Y=Qn$~^lJ+NKS$?)wTWb&WKBXIQUe7QAVRS#o;+&Axx&ixJ^ij>+|W zsua1abkj+fS&q#je5)ZMq^NgqM+4L;5wT4B_EX^W91xZQR_^k)?x*Xc?2T%DG_=M7 z+1j{{c_idR0jhn~L|rvgENAJ!RK)try5=nW(}}Af2nErh>5M61=Ohg^MA%qA$&BpO zKxO{?pA4U!z6SVnm_mL)X_8J)c3K(<^u1TJrHPV|Y>4GIfdb^bmuyZo)BBJQuAS^# z1{QzcuPWK(mx7Ua4zgpfufh+h)Ze#4g!C~mF<*I)h_$;{d`ytCr^QwN%Jd`bCP#Z@ z%gSYwgURUkWf?S*O(s81tI(~5;N+6BDNbU}ea}H_iG~Z{j<;*rDj`aqfqTgUcAIy7 znK50eW|IY{Gh9-aV#JGHlM6ia+Y29v4Cx*ME130Z2&hK)2b$2u4G6gkTAigs3a`XZ z<>_EL?@aAd9X<^H7sm`a*&u0>BIfp(U%D}VdO44uT(YB{YSh9_OUplN&Ty=^S{-hm z!DV52gMv+`8f?K#W>l(9HRT@8VdY!LKCw;^m(HFki zx9#`U+WS;|v{n=HuN8XIB^+tNph-In4;ZphG&ue2 zu$)S0B&pb&NQT-&=|7P#UzrgCA<#F!GHQV|XM5$I(;@N?wOc}}$d#)-=Zws4ETSgR 
z&Qt$oy*kq9u4s)FNuD`_zV7y6-Wp~IXUJ@vM>lz=0@q*4e$MCrG8KmBTW4#HdN1zfRO&1J6ABu@?BIk=3jDT_k!$jR$qCz5XBbtZvt2Ed3Q+ z@owFFHrx0GUKifKdyu$X7f>*sT*O|Tyj33&Y`Wmvk3`~jMMgmjp2i8`)o)(;z3+`q zr6}AZ=s2er&+lvW3Y)?$F0nn>dU~g#X1ylkkwQ)IksEM` z>LR?BtbOqmQ$2;qNlQkf=?2pMTg4Lrgd|6a%l#ErdA1E2sRzmyUSHdbh-hLfnm$=8 z#e4H$Cqe?WW4wdR=#STOvaP3Rv!=_I?W?EyRHlpClH{-D>qDg1Oh|3q_p1n1IsU%= z4wnK+7sTwtwqTZcf?RICtz<%(qKBzy+m!c>fU(f@vkvD#PAteNw{@@Gei`zl#BYsA zni_zUunKsF%lex{j4lJF9TSORBla)F+3kZ$5@?eZp-47jp-kCv!$q9ZiAgEFxB5%` zx#rhnR(}cTIObCt7mq6wS!mV0rbNt!p6a0QN1el^>E{wNugHi%BHat(!a9#Qocer; z9^nr&2@r&>-R1S(YVg~AI}@k3wDtF45Nc1PQfA`}8}Xt%F2Lr^4oT7_H+_%)fFdc*K?F}CHT zB(OPW|4}iYKbYoOb*Uijh4O=5O`I0gzB+g$G8Dwr;YXMKGHe?nV*q5LiXurqTKye2 zK+eBAIQ^_!#*2d+snR+y7b~=?oO-75kSjFbd!Wu^x^G<^U7`;5Jtt4_;Gz^ za|^H-E#dm-UHQq1ZW=D_6sgw*vb1^oVa80u%8V%)fS1t9bbwEgf}{k@DV&C$P36N?cc>;VPfrlq{0a=n;>ps5L*@r;OQ&?9quLO)7^l1Gcz>~ZeR|*v zL|GK=Qn5|amwHKli3OBTdzyaVt zY^b|J&sS>HdNCcS%!44tt5f!>5){dj2))t=2Yd_;*qD9elu4|0HPnZop&eV6w!-C# zgFVs?1HX)vmqJ=}W}!_sjnLfHpLgh-q)scXZyCslx&;t0&_89Iv!hOwe(t9}^FW=| zMHWQxZfY%g`BEsww(;@4DWAHYPtQxr0YTio zu9Khcfh!)B%=vWH4;#k1T5TP7MU2S1`x`XkDENI%g17f=c5S~qTm%X|$`rrLg~$G?0sWkv?qO*d`eEy(+!CcApC~55;!l=dzLXmsFLXAJ!Bf z?BE>pKG<(I_bEN^5X6aSg2`m7LK=gN&Vmd>wY{eEgEwy=Q#gd7%;ze-Rq|`yxJVp4 zGAB*5;CZm1Wc26iyxE@59vdgi^fwy>5C5o(YHiC8BSE!qO&XNb5g?7Y+^BtCmGZ6!_~AcG{y|6QL_K@dm1QCJmHqG-q!< z=D26mCObH{oUA&2ukQ)8W&g(9HPn9Q&j8TnU5+3_b#`coJm%aSXjN#=*|BOM;7#C& znu8Jo*?P<7Jg7Fryn3E+r+!(+zkuTTR2XR=goK%UeUei3F>6(2DoBPhV#{)PYN8{5 z?C@A`rLFsIAKilC`&B(d7o{O!C!Qu8IN7T394nwdh=c_nFq&pvscl5aO!(#&a_J-N zKmZ0&Z$koWBd-UatB+e7Q#q0KCZHtpOI1?Rluq*1uQLyf+To(axOCB4TRP>GTN%loA-;V0mg}!p}h)7c?|LOb6?J0 zQvy3qUJPlA(qw-f*@;uWSkBW+;#8gLNjCb+S`Ioa|%&|@%rDJ(k8e!njAND6xK5iqN;^7{o@!osM# zSZ-Vh#Wy`I55U-G6&h7L55s*6dldT?ZXH1(Z=Kv~dxkG7U~`Oc@7l4FcqGYz}b%}j;}Rj^S_XHa}Y`}ly?_%%*^1vX_lr8 zu4oiUKMhFs6PL$+JiANPv`Hemm|dUDc>W4YYT{DoQKb|lX|L>vKlMU8aDCEHJSZ;2 z{nxKgOSI8JsX|ZqbC;BAC!fn?LvFca$5P)of!aFVY9@lYwfQ2Q$7g>vX`NY2aiW#2 
zl>p?0H^lZ@9+vZ!eKKxg=S6V3yphwb68wIie>qt@&@Az?hAdBo=i0$x3qa9Qh099# zXpEWw$$%jLr6dphxH~`Dx(9noX%=7dOLp@_Z#sZ#raPS8&3&v-Qn@TXSflaLYCs+7 z%g9c@*SOMYX{##i7lF0>GnG9Lt=!otX4BP7wBpusw->2V%`Ag|&)eXRvUA{+g_juQ zPM*21ZgeIt#(YCXN`Ky}njW1}0cce$#$Vo<&`WY*e%I20n^W}P;8%T9NZ;G?+>hsI z!H|l|wXJ;MELXPq4gBcN(q7?%6l6{6r6mlvO&e2#qDi`3|8%p6AJrSfn+?U#e>2il z@vj)aA3Gm?5KvZ`PC3-kWlr@cES1^ke%4ye)fbW8^AFMie_^*>)MmnED(gr+7cXD$ zZ%5&4)4|6eWGC{+C9@{3pAzm_6#LPYQKxuL1y7^n4Oz)Z$Tr$`KLfS$$%>QXWr;D#h&r zlIbyJf>!@6gp}=`eyK{heE;tA`CqTYGAheoaiL5Tn*1T4(IcG~!C=FrCU5&U8vON- zPIutGkBDrupHuDG(h?u`k2*WJ@yBBmQIm3Z(fl7rlpHRYU3*M}OXbK}F*JH{W9TGL zS|uvk^}*%<(oTc8UV15Z!u8A*!R5d>eN^*-vKJak+sbsU=&gLzNV!1M$)+4R{!Mj% zpehI@H%q5}ggiT`&zvb$072;asVdQKr?v*}CD&QX$c@b~bzqCu!U5DWN+U1dNZ#nJ z-v3(hTcGjih)P9}8TGzg;Gy*7CVSI8SEg}}^bBk$ z%_{fIWJ*dW)W=^#ZEhDLQd4Fd9<2+er=liuZ*@_SnzGyoZK;q++_JsK7%9-!I&#$tXVl51ecZiSvl{spTIU z&$OUmU%a?8eX6u(A4XdZ=UXsdAVyd%IP~tW4`bTY_KS{4*-hYZh)sXhF*b^t+`eK<#wZcD;YO-D2fl3g)(QavUQyU%42P?7g}($Hd@E3-x0 zL+!8p?EFf>Oz)0HYHvVXA;l=tscQL<8{pCI%{{C0RTHbb=aXw#>zuySt0k4}B5vH_ zIx8MFq&0vvI?q=vs$tbp>t&Nvn4x_7ZHSX60l_S0^cj5n|gySd@|` z#{w$MqlOzoY#KCx^hkZ|vN0mq|87{j^~T~Zwp+&n8A5s*~j1}gzEIfG`U#-UW<4!{6*r8hi zyHL{9(JXvk0j9ykK!3DXGGxJT*LbO7=ShbDQD;$PE7pB+RytDOniM{*GI5bPJ8bKf zLJgfcmWE&&O5sPYG|qW0Jz!;RUmk7~?@N1xJwme8+$`;?+CtEpVqV*A;CEVJff=pb zO~^D@zPqX6T5>F1E$BM9B=mIyRZTyG_B9)>hGNN5!3>e4L0(9dGfb`*+d6y7mOuM# zcnxbrLz%6eyy3aj6U62ku$yX;64JBId5iZvO?6EG(tVDhxYGh|@49xP*`K+}rQR|7Ae?yUj?GnFaNo30tPYwt62r6Uzyg#-j)&~(VYhRd~ci%d>Yl1xd{3IBG zHBQAt%+xvMNEa&XOD7oqm`>k9P8S!;#a)?yS9jFxonko?9UAb5c)DsyR}N?)MzA4u zDs5zWS3aCN#=aV*b2Ihxm_!Ob&oyF#?TIp`$j&1q|Y-K)t30*|f9r zt2GpZU)>yo$y_*YW3L288W-cY4^s3(%296EWtkdOu`+3FR2HN!lnL7Hb3W+G+_Ohl z3ZHC4p8Udi0>;Yywg{HYJ}p8B{p4>Oaku_1S;e|VDH+?sE;!7#6N9#W2Dz6C>Ceer zB>0Mn$t}{MjM>pUszLPCh7#{q`0Q$yzdkshQN3_2plM3()$4b#wZaJaPVIp@nQ3^> zAn-nU+p6N`%LZW_Bo`-^JFfj*5!*7(+iwB+xFJVMHBA0XxYU&HdP|lEqLUl7LBu}P zSAupYf&ndkq)7(d4LFxNPBg#^4kBQA*df%avwtkGRH9_oJmbg`e^jZ}nwQDHMobR{ 
zwQ9FORpH;wEl^umz^brOv)$)?xX$bj5IXRtpEL^)ztqd*>PdK^;%7N_-Z{JO)s4_R zA84btf;QM{#+ZIO28Gd4SL5fo6hp#eXw55Wko*n~952`(0epAx)~gnLzpN!3_fX@< zh3p{5;z@lqDmk0oTIn_sZ*ik0ur5C`=TNEQo>R^(VeX*5gS zS86k>?Ap%6qYItONhX0zQ4c!m#7PuV=d~dXoG$&+-Eoys&(+PHw9|IiA4BNq+r@mR z_9%6Bh0)l>jlE*j&Jcr+y%{MiesogTo+_dJx_5#;@S7{^T{j;8J zzrW)Ie8=|6!-57VBtt$O*fRUOJpNC{ResI$yKmZN2(8RGSavcYLJY@IihbeLk;;}i zrBEkRYSNp`pD#;+sM?>kKL}f$n1G0DUALPTuO-hXl$;eMJ>BXDH6GABJYYN*OTt)FR@Xz z%AMW90+Xp3t|6g_yx_Miu@?2#!8-jchFIMdUOlXO*=+Js=Q#!#41nRXX@)yEcly`- zenRd1l8uac;c0B%rTb-DS~|pH+)w;I>s(QPGsEgE%NcwF#y0nQ2^Qh)m`^RUGEbdH zg2Q2@P=`mCf<*-!)ycpMlv4_B0G7G-qm+_jhup{Y=epfL4t#*_UBF!z60? zgtb!BR%S06n3~O598|60FARk$O~%{h>bx`}8Ux<$7TNE&4}*6I!pqjFvj=1s$u)ZO zEsiX-zUWa2#r7(#_B-v5pg!MzRLfU=D`fELUaq6bUj&+*h5M4pEic6NnZd7xq28Nw zVit1ZX=km`Ec;Bqq!bkEqxwVb(*{lyC^kcG3@Q}|Z$bhO*~JO&ZtO~FXUJi&Dizob>-Y5*4>nHx+8b(s}+b%*xk0e9MQ z>`2d8w{l>siiTc@aG>Nr1odf$DTU@EXEv8C>YAA*x{!7jB<#oFCaKww4?4D%mmkB=qj8 zTW>t}UsLw1qjjiWbv+ksM$az2aLn(h3DO{?tI)o0SUTGMrNK?ucVJ)8Q!HH_Dl7~powRMV=2)MenP8NMCQ?8qdQXj{jxCM7O<8<|9X*ROnM04f zG-edC`g*!(Vk(@SR{Io6gUVQ>b*yFIHF9a&`kfdL4SjI>DkW@|KVGJ}zv__`bH5eo zc7c=595?erLA7)z2e`yFp%vb!9}A~-#QEEk6NpYRD%_}Zq)IEghHe~6Dg3GYw!!2P z>G7P2Z0D<60WnJfR!;50%tTd3*^p7iuvmrpzEfTNA&oJP!JC#gpxrzbx` zz2*oTV~Y>5y?j4vnuLiP)XyCXjAhL^NITUvoFDXT!^nFE?C2P#MQg&p>M{uo*wInj zIJ9j&PV~o)S$}uzNslBc&A(8qoSq;?`kf1Vtn9m=9?2DoT^wDH>f>$ol(UZdl!dgr zAG`bt%JoqrI^%}&SlaRpFI(QNUOAg$|AHPBa)SBjfy9n3bZ5nqJ9YuFktyERc{WaP z5I5|rB@x*Y{ZCA7U-7HpbDjg0dH_XHPxu1+k5#WV32Qst8;05;nhrG=+uHTA$#Dgp zY8Rb)X*FpmY9)t3g{&a_>VC|AK8wbti0bDzDLaG#Yt2~)XLmp1I3@_Qh(OY9p_Qy* zWm^4BBqzy;^j)F4oFF|T^YkvK%z@XPLYd1xI&5{q7G?C?h<#_;#&1`AsXLTB|LRRe zFe0!*$HK2&M!o!9?9}^?#$vg_s9}Xjl54^c=dp~$3xGo78b>0i(Q(84Y@8T$^KN*seMu3>|fdKN0s6rS_tk`YraDqm(~5= zh7Ua6E9M+14Gs1apNNvf8CD`&?lC?+y%po(njO_?%Ye+2IaG-aPy;``idH%riko(n z-WDDD9ueuTE=yve=NPb#b!t#%fQnAlZ%%@Bg%Ud7@}`ysAbG$EX&SYDUpt(UHyCa& zy~;cKT6Kh>{g}#w7cL))VOEd_yHIvVK;tUbB*kz|wImo++Hd#WD%(+9)RhJNQcG{Z 
zHoEHPAsyxi1=d1ne8aGp^YzaSvw1AmUdIbr^_f#fRt1qiyUrY2#R(SPd&>nTO&;9R zo}AAPu_NLwNLz{`1RDWiTZalopo7K&<18d9ed~IH8$>yd+Ujy0)xvM1c-{hoo@Fz_ zn0g;CWR}_LeD+9m6^1a}v8|nTeI|d)&5xSL(o$Sf21L%1)Dbb4CRON_W!iqi##m9X zQj(I_dAjE}2Q3PHx22B*hpoh~8ik`ZvD+H6o@yqV99y@t4PLRtnC+azcVmW)J} zM=h>)va8tp!HC+&MOZZP=im2)r_X6cg!gnA#Ek*U46DqtY@)~4Yo19vZ{@J0ks~aV z`9E^6gIrO;L6U7i_PPATRWzv4Gpp$#1NoR6m_#g~LZXSh$YFIEon*o{f$(El0 zk4jtT`HK%s^F&b7ELHVaHCIwz1?v?}RSH?aHrdTadvDdkh;Y zz_+}n$Mvt1@{6lmxwKD87WMJlZTrO>K@Is_Ac4PATKiHFcyA4uoyNarH+2-vlNp&q$n!?lfkIu1rJq1maS3qwnc(T{ z(Oe`>_?&`m=}vdBo-X>e_H5hg3-)?;z^pZ)N5_<=l0XY4AM;r3jIrpM{ZuAt8 z@V@E8Fa}7sb($StnVp~oz(hkctiY@_Kj$LTGoq@y2?NUY5~GI^p}+7m$~^Y@$k|GF zs(<`pB|J#g?f_pe9nKtOOX3i!D==cgd?@)oZh_K{?47kroE_J2JLYkKB>5)aLHQ{l zb(m$g6sF-IH>_33G14JZ&T_g;Nmc%HFDM7wx*EH8%>MVBN5x6K%J>C7Iw-+N-qlv>8H%C8U*FKSdbS_cB$tD(}>0L<6FEt!e{8rv1@; zi~_BFu5AK1A#{qzDI4X|3Ta1I5y|0t7DQmDS?88Jb+x?bE6j^cAWC#m+PBUng|Y>z zE($60+E0MyLvQSX!O7~Ng*R0%Djg>2QJxBHwOhTnD4C@HCPU}u{rNcUe44cLH5n9E zh>I;5ANS6on4vV59I>Mxeg~oTHpQBm8e_=4GWhVAPh$@w zI(rd@m@HSef5WO?*sKp6bZ*xj-G&G<0GMvDIH8!F!QU zOt-@nEFFe`t1JxqR{iWcdUGmy&$Np8{F?`jBRlm`Z*X^$IXENW(dR8Db8j_6w9*r_ zYvd^yG|y3fIenHnZ>^}@eCU>Qm-3b|KZSgfP$>b zF?)&=TH~5~9)1GQ^q~EbB9(XkM_$_p14QJ+AISd}ll;A+r%W1_iW8vWo1z`g!NESr zeKdsJJLCWyh`3|sI~CFh-#F*D-Yd0fiLcEWQ05DtYFLd*e=6TA#z^n?9Ey%nRYdUf zHrqnT?xcV56{NCuF4r*9di4!jHWWX6PLb|ri6nDotEx^bH*|C73}>&1JE-h}qvd_D zmr&iiqb2?k)-InOJ-d^bX)Uut_sQ0rFVCR$Woh|)U}XhqM{{(rcSue}3nEgl>sjQ@ z%j5WAEXIUbj~kqNa$7dnVq>CR_euZ_MYW)%>};9mS8?0#X3TqVD6pM#Lojk~EY_VW zxS7t;+1u!1v03=@o?0ctant45BemwyJ6!jM;r_ZPH?bz`SSyF~OZ^+dWx)3jt*#Wjo<~ zL_J5b>Ci2OqCq_E*tVB?t&eE6VI2?$g$H}dSFjWBuQ4_-U5yF?=9jW8XgUZi!N|eZcg?7QFD2Nd(-yU07ueHy3qfPASnl0*<`U-zA*TVH0hE;E1g#Z)E4rMwfvkq5%=%Ko)x+?)l+&XB-u+EfC)|O0@G$njoRu`1{}J|=pi^)ROT74wwGLM(Zxeo^m7s( z!IlBdu*xumFP3zh>{2d%x~P<#O1b6UtI>b^tGyqM(sQLnY~0YT9Sw)J)eARWssVca z)W3Py=Gv`*y$Xk)38;QRy&a7@)Z?;}_PDAXRHV zr_EbQg_3jBfVVR>Sf4HAMKMAxseB=13oyCY36oPQYaQIlvH4BHhB(kA`YI`*Q=(vK 
zlIp%>z2!fW8jC9mrK5B9z~xVy(0vLI;rOaCx~hwM%VE^&d~bbO)8E79!xy7%xLt^N z4ra`DqMxstT~2vy!JQYYIy7_!5tJZ6*&Z9tFW`^FGcn%UXlPK-Fcke1o@}{Sr}qeL zUiPtYtWz-`^46vG0xWO$*AxDMhkiKA&?ECVu%%q6nD+1Uq{IQzW0#MYzdpD~+Jy!f zp^8#vV)AfZ;v2(1U9W0teyv)$piZ2X%B|AE%>8UE^AoFb2t2OF&b zu*(&^&}FCohc+^qr?dBnfhYQdXXHYA*X@C2+NBrS#J17E?t;I5@gE_|FL3hcWo>{>mK*zNN^{U9R3#%1OnwtwAR zmzq(sz?!KEU7+A_jdep2b{tFq;?_Hj8=x8`)=pI#AWquS=O=rZgWr4nVfPk7iI`Y! zQZZ@DDfTOG&|~<&kApoEIXL48{8#c7-F4w&`V9iVuRjv*xsn=CNPvIu+(;=A3YG1)tl_n+-c~`gcwc!h*n|8wZ z1J+#EJzouW8ViH}Yxhn_*$1CdW2CG5MPMhud$O?}oj?qO9?N!Zt5hvw_8t$Tx+0oZ9^xt9k$*Xh69yVD(nQ0FM8W z0z8Xh{v4f>=V5Pe+7fVCF#>RiNChAH;c(Udhz1^G^ZGg0+HH3#FU99`uLKq!zn4(^ zbE%fEACA)a<;4@h`mY&}-A82ra#d2ljHp$lVH@`sKtIwf6@j&qeSoElnd4?FK*ffh4&J4@YI1RK~l1X2^*p*0rz-IIJVXeH)VwDhX(%)WM2o$4qzV zpSG-S3c>_g1v*;7H|ar1v~>Luw(Xb3rq9B|?qMr=B)1>*Rmsfk0R$PwANbnug}avY zvg6r5U1QY1!4Qv2yWjWbZ%8}qPD6^yf%Vlie5{9W{zoYKkTz+;;%jr?TggKT4b8wZ zHPb@$o;x)aCzW=C?IwG!&{9n#ZVNdc&}^Oe_u?_RLG2~itAd0fjR!W%)=*y&IF+l; z$p&p7|6o9VK$|5UgFUlTT6~J#Ev(i`_+fgHA#b(GFWU!ufGXvKT#MG=DIe=;nGH=g ze3|%f18U$Z?0WIjHi@%?KlMlVA-khAmudx3R7?d-v__t z#CpZNnWueNDJpgIe-PnF7d+3CZIyCPHa1RC4jyj1_D@$31B!y@(8w|4D&FNY?uLr8SGaZya`ayv zvRw0-O3%66z5@=Yp+yHXMYR6jfUG`64a+gOQ5$d)@PcDbs^UI|1Q`3e(aJ zT4&u1h4gPvKKHeG#J|elln%#(v|j+s?at;~8Rb8YYYCshuxGHf+d!>l=Wc{0D+$rc z4>@d5jy5a*VMwo;exFC$0KgZp>#6fJlw^XBfBQbr?TbIs!P>h-c*ph6CN+$*CzSWu zzi35~bnHKk2B-g%NrJn=ru|gDJex3N26C~9fyuRJ75pArG${FGKfO88+w488xRZh0 z^tC!UE3xmm-ou=kh)o)44)`-3}3HBC@h3{GtIz{wZ_^vux!*1p8j z$LwDVYShL~JB8Yt>@iCZXJ?^>b$s9Wg8t~}DSGNCYo=~Zb;k__2qz1!C6{OXdO&qB#;#i_|( zOqnDl65lvTIqI)x01x?QO23i2`5QnR?{Mu;UgSDCa_TFC3U&u%60n?LwdcKfh@Rvb z43C4&K0hz8SfQHkfp+NcW}P_1>|*g5V^@s`eYkQ8bP&Xcok3+*7R>357&NMY2syO1 z27-IrEX`&s_Q4NK!@2eCi+|jKa~v#q!bHQFrJF)BQk25a3olBy`%8!|FQv}XR;kh4 zNBi&_6l?0NqnNX|`)^ItW=h)+I~RMarX^qR*FRLqG<}?oda(R=DMJiPf0vjU@F643 zjOxDy;)r6CU{}F^yo!l!pWEH_QP@F(P*@S!r@p@Sp13kK;pRF~GVw{xbQfSP5JgpY1r? 
z(5BFf^-Q_}``^CYH#yJ2$u|y?{S)!z0M?+}!OCt?5nzuJ5Aa4OgKZ~R_nnPnsM5E6w%nPy_4#L+_fEwu%Hk zJfnAf1LxeU@%Vt&kjL~Bg%j$tMQfvQ&G_uuJHU7e)$Zkn*iK3$7}`egNuS8)KtcRW zEay&?ai&Ti?%>Z^KdYvyk2H47O!UVc`~>w$6@LRPoStM7HEY!P8)MD!Z2lzF>q7%x zfhhUQfLSHBMYbk#Fl+M`Y+enF3MnT|M}8nHW+0*5w~!~4s-|!M6v+?~126L+!rv#y5Y`6}7fR5wK=Z#i zvJD2a`Wml4PAd005Pq?FMUKMG1c{(g!21aJ5R1y>rp`5ciJW(40*$z}a@K*~<~T3k z(3g5u;1}gl3cpKcC!FTnZn@A-DnK24#sxs?>VPjRDB8UfoOFimR@&0K3gjrZZXp}5 z1B*d|J2DjTk(H4l&2wI4uwip9y^fQi%&i|gMxFs&0tCo1G^}tR^;g7l$DaOc@T;FV zl1mHsG$V|aPcG;UfV}Tl7Uv6RXvQC}TObkVmYbyUquJrZZRB`lqi~lBG@dRXlU9n` z0TYFj8n(X{6ps~Ou!K4oOsUZoCHF5+v9NsWC@0{P1Pe4Vpv$y$;ZAz%z}k_f$ci8v z4EcITK4o=dJzZ|^-L0Q3+(|>+le4Gz`}aeQV((%C`_V%ITW(o3+{~4u0|nH}E#`d>+h=oR z8x-VW(L z0x2bz31M5gES(X=Zt^s()s7A!`vHBWEUv1b6?3gT)|5~#SnyoX3Xw`n>shFkfGZo6 z-jRMd9@6+WIZ9U?=^}Uo^xP6M$y35uuW&juSub~}1MW#caFrSJ98iBrnMGq{o z|5)ZTl~(ugR&IxLJGyQ>r1viy1I{g1Fk_ZhzLHe&J8v^Wok!qiXZD{My!LTJkTBIc zl@2YD*#oo_>zsaq*xI9(<_TJBc+9W!eLs7p*N4QAik>^;( zup-NvT>clreqE~1MmFjvS7wPYGy`&BR-;#6Jc&z;l2LCfB0-rCq8WrHx9@gb5w%}1 z9kheJvcg_kKd#FZb%HO+(52oGe;7F!Oz%kfno?FE1-`l-CpG zA4fwjcbpB*U$Z}6J-LegIB9&a5?Xall+uTaET=UwOZxqPGdr(cW?B-^w}u^(a&#UJ zuH;dvzCnBIkuF}^zoi|lq7R2icz04E=~++|A4RfK%Ngwci2cp+W*w)w6fCy34^s7ad3FX?)tdrI%HRujPMBt2yt4c&^85hbjege5Ny38*F?RLn{IsE*;vBj!N3*iG8Wu ze5o+?tDCDU;0d`weZ{lvrc`rN+pmAoO9dU0x>CCr!I}as!}ZhM1)^KoPOH$m-C46@ z*NxLtrW-c({729;PusD~*s|EiqD?HF*=eb{7vB;;e4q+L@ILh2NNGkJ<@>2Dm$3%l zBTzZ=0iXr^?}sDyRffi96>CK9y)_&6gp{S<7YXFO#I3&OV0OQxX0sP45KF!$1K#3( zDXrd>NY|-?8E&Mjwtxzm!$_N3MMs8=U6=zxW}>Yn?CjYlB@a|^I}L#i3GWcIJK!Pw zz#K9z5O^w#A01z2^~K=&Tb4ZMPjiD0vLVZO)lGa{Y6^UHq|U-{pQY14LzNwMPNfnD zf>lHhY;Yo2Ks#umZ(W}1Nm*r(A4av~)iQ0P2Uz$;b4fj${KbTU5e-Sc2 zP;pmrdm#7oeY6)!xf+TT7By~8+7}Y{`5!z9h7V2A;`ClMq9OgCU!=X z$bOm1267{fGJ15m#}2BbOjLT3&-tRd*L_XNkUnGOi<=AcKb|~(`C!*=h~20cRc@}` zuR54+1im=L)r4xQB-h(9Hxes(>GN8gzXI=!WEVeIY2}`P$isX1an1| z(@RP5jfT%cq5_3l7C*^uur^6kA^5)FY!4fTEH_;H#ZHvXv%*eO;4PmDUKnUdwn+mc 
zVUFOH(4*(`sP7*A1020AeSA6q77kS!ef1*W+&DDZsNN#g&)oBN|La{f`lZh(AJywJP=m*M;Vm?P4m9Y^(TGAR*4&%pRVLu1b0^dYgdNF5BeDd|#; z6QW#&K~VlG>OmhKA)3EYQNNb9cagQ{F<+qHPz{@IoZ}GPkg0Fb4gb$nvl-Yqh2Bjk zOrI&}IIq5$587{vmC%Y*T;~P{*5yY%{o?PZE@C!v4C z{Jh10l;SB%Fntn|FP}2Cq*yvV4_}$GLhvU6f%Ur$4iP!}JoCk%8>jc}uol#13n>rl z7fxL-im*tsys~-64I=gy|Libq6z6{z!3mm8*l>TYRQY-s^RK&9k0N3VjNxwV+Tq zmwpyfbr?n}ier0a5bQ{Tn{}=f8V%j;m`GQTvXpJ7K#bA;?@W3+;ycN}wt+^f?g`Jj z9Aa*#5(TM@fwLyIBcBidM}3Lk2pg*@==P1}#} zu7>K8`j2=&tLtwYgHl_M*F#C%mdWbky^mpcHZAk}^%|(V?75_D41i;`6u?l1;Ib|{ zoEs-z*n8gAUlYtxuCMEm<2reYMb&dl_%-(d$%TnEky34xeE)VFNS6m6;lzg0)6LarlryP6R-or@z zf@sc{nqWR`ODE{aJ(?5(Gs1~r4}yl-x{DuEjft0`%NvuX&#O8QRJagyqkH?&MOs?< z(nvLhzUTGpYzQJ?w)!BjP;6UPdHiU*{BfkKzL6dGa^fVucIX6E{8Zac!FNRr0yB-o* zvo}1cexoS3j|Qo;>@Vso?kmIB=D1uE5@^$NycP5cA((@vkrkU551~UGSaqAD_-0L` z>y`qDY!zAU|GF|&ubqukG-dX-pBB!4fbcdU)`{Elfjz^(&$MuM6Z=Jg+VM$eGtbpi zOhw*3mXln9zvMJWjig@%D|7pimS*u(M{Z#LxA{u(0|E7omAbUd;N-oUt{U-e^IJRl z{&GyqzM80r0C36Gz~3c5dWO%LXl?IeB=!L(tQaELIk(eK*|Fj2VL*#tP@ax81V^#n zq9LZYzvymYF?X_UR;f(exzViidtcpJO(}jHf%3th<$G0BX>~{#ncMe-ABEu^pkmIlB9QvS_S$`XyZw;zASFT+2X6kFH1zpn zid665yy@T6vcGveqe-hO^QM>lDlmzD?ynL&R{eR$hg{X&ekoY#URVT!VSL!u`Iwa2 z;<4)MHGn6fah3Aay&@S9JpOY!s&yGT7G4?QxpgxgB#xo zb$r32e0GL%ER={Ds)M9GuwD9@A#@BoK@Hy8$A370@O|;4Jc>Qzm`|w zG9b*P8vNv2iX=qZv8z+VEvpo>Z$Kg2q@Ca1v-AqHAGw*53jB%AdT_O9xq4PiJPlXA zgKMDA``tNEa~^Y<^g~r@gjj#+;i_8j*K<-c==_n}6-7S%7Nf*IR^L1A~*bOOtoPqG}I5!LSwEwS%dk{huYAxxs4=%ED?!*Xflou+$ z&@UXvT&Bw*nyxNJmZgE1_3vRg>-(*)?U(?7b?VV2vxa7CYK^@YcUvtm;v8{~TG-<|9vYlF>{! 
zj1;xEigq0!wWq~8`FT?!jqKV~HpSq0y}x(cNOH;r8J7g(7#t7QrlQh^^LhQD-~3Ro z*n6DU+4!eW_)ba~7_{xr9|Jg~L$c;m0P?H3lIzjiBN`>oFOm5rMG-a$(-o+ZKBeEu zB}m0_sQ}=yX_GY24^{m7+ZP#`~`Lem%3F>pUK}xti2vX)Y`$6Fr`!gqQ zn*O6mm*|)ZMd>NE7%IF=XE(S_NHQu=I-h*9wJjG}nHA>*gg;ffHY{ZE{f>oEQXk4= zVW7?YRUKaUh!HV#yn;2WjrX8V^l`<#imjLr5Cr`QJ^DzuTJ<)?*d;o8!ewX~M#%1< z$yM!`_7A4Wv%DOiZ_g6iz?t*=x)qLVGfynWm5dOLJ|jJ?%MkL94?fu2w*fH6?63m= zOW^XCq1H&HF9>G1s3eAPdD?xx7_%Fok4t*Q5lh*t$a>@blZWGkHa>!<;S$vIJm?#< zf~r*bj*vq5&gk}i^)lgAiO_pF`j8Dm><&q3RrOFtvdnMr$}zS8XKK2Y>Zv7DNI{QO z0z*YOdS?a@u8&{z_#w)wK8_{T^F@>VKQrW)_k^ttPw!}QVTc7h9q-vHFEMp@ zZ{e7Ed%*;I?4lff@9#I)m@1lF;-^IHA5G}YoF?WILf4zP2zE>enF}r~dp4JCc-J5BXhmOL2u`q9p&k{5K0}`fSQz zteeKj9LJ+Tji7fp4dyaQ<;>y_ry*U$PrTus@$H`b6!VPn3_2`K;3L`4{Ca>NHxIeB zkuZ#O33vu0QctwW^C{`H!+BJ=R|(X5e&W-+>;7jZNlP-dW-wl zCYr2UYOXTpuvD{53>kn(pEV+fYk55M_wD@dt2U{eUsd~VkPbnn?mVf@JbJPJa%NK| zjDkWmX~PYC`59OzvN$x~qq;JENCUv~o{xmPxl5|%Osorw$A@F~=H^gh&1%#-3< zF7d)YnShSH{Os~MYv;q;=&Wsf$YL)Yx4u^l(&!M4cpEbOUQWM0r;TXNz)+;^atkYp z@x9Dq7N&U2jv*FmS)-~#QEaMrGv=|vT7fRgmAyc)pdXWeX7=X zT?VqwGfpfD@c&RqyAy3XRdS&W{PuQMcW-M+mv_X@qIpHscjs|z30-X-bu;rkJ3grf zff9b|><}eVwE;a-6=(kOE`pC|>jC{aj2k`prbZ+c*`hazTZ~`4EylPPBpP^~bf)V0 ze6yZD9iRF2X~Fa7N(jGz-Bp~T;{YB|TVhDlGiFE05s!>rqP5JJ&j0S;{yVUn23DvTB*O?_2bT@tifxn^x>&R?wPS9wy0}8{g>Ta!>oh*eRVvJ zO*ASM7(rGp@krM|_%||0LvT_*c`jGa!kfG}aO^y)zhTrVeQwy)8H(|i2;o{81Semd zJZejU_=6N<7pS`30>DcWeG}Si!>7YFSr~D6vOTK*q^w$dvm*t={bPn}4PkQ+mK5F2 zPJ^Sjm#Ibeu6gNO-ZO?LTV%gF?i^80e!k0Hq>N8ZyrnKzl83pFjfw3eh%7Ih<=iDY z8MSGV_{ghZ=0+qhlUVE4h~KNCqS#E6ZS*zO+Q*6$88U;k;U#rB&Yj?_@TNOX7Ntf$(pdawDc-)_mmoAgOUzySf<@GfA=uP zn>+TU8#QtiGLk=XsR(hKSEp_9%+rDMGML@KyNgk4X@EX&OkMogbN`2O1O3y=B^0Zy zUw>xLO)nky9SDZ|WsqlVu`yhTe4y4G@gnr}D{+*o&Yd>MBL3?= z^Z9JzAUZQuQ`EDUV?yZMPNAqc`EA52HHzFZB>@DBPBJ-K92;3>_yL;dljP2}T&Mkn zUuIQzyaqu3seXa3_rn26)roF!ixq?X|wGccU_*&st4_af3VY#$_VlMrMaGMD4`fkxs!WxwMpt1l+F?M z8%i?geUnE`C$^bEvl-1vTL7}13p2o~_UbnK+Wok}oK;7clhg8kF#CI?Y4T#43GAjT z#fe+>w&IV@vMI&L>X#}}2)28si$~L}1USD*Wuz|7iN)n7!hG&JasKIZATM3Vuk^c) 
zY)BVv+VD~ZA$UE>!h}MOF={_1;$4E+DieOWi-{jePXOoF9wzouznQunSbU)VHdlA^ zzI=tR?)P7515fmg7m?SMp;gA$-)5KYb)sv7x#sVJTcWbhl+vTY%C%Aji`mG-r(Rj| z`O$=r?_eqfP}kxSxWQdaT1#=q8S;Kd-WRBri(x9L;O3g}cLX$fZW?zi`5NU>$VM8R zrQZl}w#kxUAI2b@&mx!Xk?IN=>a}1&3}KiB@skId)rrLKu!4h{@ToHCQz;Lb@746< zf=#8y4(It1Vuv%FR`_MVtNmueO5Pb0R#+#yB=yK$^YG%gYN{3#X$^6RjUbGxBJ2mV zNlFsh(4osSJ8I=A;V}7^z=^f+JD1^vQT_u;I8^{Z6{b^CYswKu7MH<`!bx;T2s2G)a^(hHcQYP~eZ1QbgJ zXSUnW(yV-DV$>|UJAo&$#4(nsMpmuYGoiOyYac^=!geo#vIESvu{kwDFj(5J64Wh>c5`GoKYA zCk&q(k-=RQPYiemu{10?(BNtc{v4EWwYN2`(9a==eA!AUyKNd)#<<_pV%mu0?EuSInX%{UeLXQBkv_@D3Ptp{A+T-K1^@&AvUZT zuXjU6L&P!292#rH*tbTRQ8 zpKshjz#|fb3Hk;<(j(=p{U>r4mH0a@NqT9cPWEjzoukuw^cjtP(138}YJ_BX+VueG z3cHmQ40)i@=?pY7T7g0@g*H{4hM*NylGSF4oLXD66V5TPw}9uJ@x*)Cox3AOB(E~w z4p4mreuoy7=DzT)22}<$3k)T4gu9H`;)uHwMH$xkryPBR0*6%f8f| z7iaIuJ*FT}IM5)1v!Xv+DG?tO-L1Rib2HL{Y!1^pDtZoq;XVg5G=v) ze+ZJ~V}zBBMZ;dW-z733SYh>;#c;=yEPO>&c25l$@#VoQ7{mwe2oa9o(VQMR`H+0l zIXof$G+mP0Ue)P6N~08{vyvoiCum~8J63+|%U9Z@i;H>jkC&6HR+zffp**0>IAeqL zUe!g`<{aeCX;AE%4ZI=iw$lpWLwwbhr*9M|Eb>JK8F10K2Db57zY`8O)mU`cGXZ48 z(!j+RatLR%eMUwTU@Qdgg6{}jUnN1g9|4#4uT?;V;UiMO<^*ccswcs1rtZ#BA8n7< z(5mHViot3}PY;vG>JnNKi#cHs-^v#Ajn+P!Dy1@kYBg{&0iGGxhT8N<7)S&G9H5A0 zkSag~fh&y%v+{Ib>-P^8C$IDQ@>&~Q!OaE81yT`fuaTA1I8BMGtc2`5ZUvXfDG- zTGjsIDLQNSo2Fi)Lt+rK@HW$T$n6mHZ0GVBl*whOx6WYjyz~5%vW)rgH#ES>#QVbO z&J1RKk&bWr>?v%(RA8|6edAz!1I2Pxk{^O;4m$4hJ3+!d#KAbg6vqNUNA~wm{`AJ} zu0}wy0tRJ2NvP>OHfZpfNQ-PiH5n3l>MF^Z4y&&H56RMoTOGM3H^*j@Mz|jMy9$My z$}BqnbDzThL_#o!9qKZXt}p%%4kBC)5)|XEg{emwb*$)N?gGe}-?e|-Q{DTPlfs?g zcc0hkH%Pi^5~}@Sw&u7dVY-zM5)8a9xwmW0*U+>Uh&HSP`L&sY;DJjA^KS;LQt}d2x994HK zVpXRFPi2;snKQ<*R||IOX_I?N2DmK?^`LeM$O{Ct0|N5BO6YJrwlGnk>qJ_ABxjb= z(T_V4_0I}}A)H*=abu1Ka7=dlt}7abl9J)oOoaJ4v3#8*1V0b103i|X^l+;!cZBHY z&z}O0Ms5Wu9Nm7ZqIbPIF{*qy>Ye&#xqVEBsF(7T5bsle74%~MtdCm8P+W85sWiyH?wkDzT>&)#>dd)6o?k;+r zZ&d81ZyDr}-D7o9@qm^2Qy>-tajS*U-it~Jmq&#Hz_!eoA_L`lQ;L-EgAtbxMKYE+ zzFQdU+OYGYDWaWn$dCvJ*l7uhLhKAr)722s^jud;B0D1308rCjw8}di3J(x*z-Phl 
zoPdq>5F&IONqd==VKeS+(QB%6Q>4ty=VW=uLa(%pI%s z7s*nxwds2#x*764K#!3kK()lf7Zw?(LxE3wk3%gUl;k`G+}K`l?I|F|qzs>6zcg!9 z>e)c*b1n%v=`7Z;cPY&eZ2bLCBSJEe2YX(jqWbjpI5ep^ILm?G1b%23k<{N+lZnJ7 zL!_T~H=l^`oUI?cO0`v(`sLn>LlkvOO;&Oe%VHq1WONAYhw++3z_W;XQp$wc7~(|- zrku23l?Am&fFx8L0WSw~$CCuPad{cTE@q6qm57wqF+li{aFnIwYw7abJ_WX!~3&Q<{(7ntlmM&Zh`(qh{2lqFxWs(HqFw{&ol@HFlx|uvk1lSoPi1S z4R6^h+0=_&>Mh_>I?>BO9Mv}|H6Ubs4p@5+f2r*inYHW&;pp1Zgy%*YFjrR+#?C+^ z9wJac)176$r8>=X&d3l`YMcb4>8EOzpXMkw#t}uo@R`oX);gv^MDU|0Od}<#Z82uK!Ir1 z^WCQk*WzC6wAxZ@M_9=57!I}z_Imu7uxy-YoyNZ@( z;_1*2UOe(xQ+oW`qHsj=-Q4~f`lFx~*TPvfi!FiHp|V7%kcZnYf@#xs!1vEwxM`R; z@bhbj8xNhOxA`9GdgfXg%*8dQK{Vm>O~Fqm6&0T^PM=lC_GngVHEOt~RCO6)Nb9hE zRwjTfvYTqcVbe3w1 zdcrM~f)sq!f1?nb&_-2Z@JvFsPR(AMqN_>M&&p!qJ6UiAvbT2~rs>0LLt4PnHV+dk zM97r$A%kDw)YB;+cjmP6k8I6|;F@xBU(7d8?feoDNo6MVph{W$BVn;ZrB$f`AM%0& zY8IT#u=JO88#0pJ2X<3c_iZPE=T!Aq7-@djii4AKY32+DH@*k1DP!mkUcvu_N0g!D zVJb)Rz~+nc=CP(xGY9G(j7>DE67GJhNi+#&tK|T+-f;Hw9R9zQ8 z&2c!ZVcUgO;0?)O)5tO}L8v$nu(VX1rWse^8nL<<6U0h>(Rn($tf6#W-Of;Q;eMm9 zq{zy*Lh3FYenIA2w1jZ`4OV*m-w7|Cv+Tp!tW5y>#tSdii3ssoomC! zMq5aIbciYnsv>tE({#sy2tT+~}S7ISYoGped3t!68q=YC! 
z_qCQ7r`WYl5+UL;Db#e*qkjY3P9Y*pR-#iH?1&SWht%SbX?0ZG3D zD*0#u&3z!mIn_%~f?&wbmEX#`x{4BAPe;Db3UDcB19hK~6dH_NTfG`UfQ4hU@wV%c zO^0Ak`^MmjcU_5q{WX?)N-*Y52X7u0A3K*BO8UU@V98%n4A@4_7_*z^JxT(mo7Mf{ zviJ+;?ZiFz-|nThN10);c!*Id2-1`YrzXN*bCRjG`>wSZUZ$MM|D2;a&X3%ti&<)s zWt{Ylyz>5c6jQ=i8vo?qLH*25i$|m` z+=fdv6UQ?TOG#>3d{`zgDx7ziIU(udwLCH-LrGd^vv2eCZs!qio*slY*RN-(@RFcX zG-=T+vd@sIjO~lOQ)W>d6TflRqU166`j{Hx&eO9Knqqh&@%r*o;u zPx@K4=R1sAG&UPIe+o7=3C21t-t5|f!Ql6(@Nc*BsgSO?!M`H*Dq20qHmm3jWPP!t z#1nq;i<}3~Kp)BK+eH~KB~^2M)CL=jWL3S5zY~gj@Y%YbDc2IY#N2Nx2}U;P>syjc zOD0G~A3MKPza!d>^^dKNdg0pw5o3MOx1QpXEoG7%eMQ;~4wK88xk5p3MkyuFT!fP8T{g?v zfnXRQ8&{@vO$AR6bA)_%tSM^VJyt4H*^Dn*IXk+s{U)k!wJ`Nai`rL(FLgSWaZaCy z*>ug|HgctrNF%VX%9c7GTIM*8<%G9c%!-~bS5a0?@JyWhIKF;!Qv--J5Bi{YJ~_H| zmrw#H5dQQVlvKQ;A8D-}rJN;O`T7Az4mJr6#b`{cF^w>@taY1jOZ`}e8Aarg#zLBI z-|eT<+x;}V5Q*kA-2G3}zqPHG`U<-E`KP{Qb z14jK5yRvvxqv#!>8U*pXS|@i>m^6a=%48%C>Wf{FB0sk@t88O~vo-4(`-~)+PtKa% zaEZ(8eKLvD(%4+j6xy&ctabUF>jOnNH$X>~CX|szPO$w=)iaUISC`dW85(cAhkpiE zU(2$schRVCX@jeawK_e7)L^}%w-*znCB-DXZC z9KB;JL+hoo0@f355tr$*dX-*+UPfF~)ZBX3*P!-Li9VOopIlZX$NchYePb+ZDAd!``Nm4cL@`~#%;I;#lvd>lOo;}5fMY9tkc4JcJQ4;Yz#eDhqw%_k(be+R%ez_T4 zia4R(syouf6^?~`0*n!W_Pfz?4nd0V!UQ9ggtJz9=x}d5t#S;rE$+<}C?Qu7JoAU| z03=Pas;+$-Joq#DM@QO7UCz4v$S*;SUn7^F^oDUQdaleihp&1n1#{yK07OSbfb;%3 zv0M&{hs~GQL?;c$xj(K;Ny>r+KTtmpnt+T|is}09ZwIAw9~BuGdJ+CwQjDPvUo>s= zipSiH`3;xG{Kju7X)U+sBx+MHcRYCc{YK93!TE4VZzNLssetT`%W&CuU;DE0B00WG zEK4Fp(4Scw8}LJIWm}yANW@YsPm|Si4JL;?LJ4nV*u}R>y;ivEMMYR_pFLNzemBf3 z&;&zR1yVz$tJc>BD8Niz(<+_y8lwsdwo{VO7c_l`Jj!(TiRXee9RK~W;#?OcHQ{LL9`Biz_1T6;X9Lu z@Y#1Lf>S`<{&K(=6E?s56w*XZxi#Ac@*`bTCZB)2Bx>A2B1KSIB2ix}Fd2dEhrw`EJE)Qz7^L-}h2fg(yab`4-^M(4?x*Nc zk6(TlR8_r*Gi5-`#xl46gvZ#P8Y?}ccLJa^-7y1YEBw!9ru&*Z8zd?sPEKUO)1(x{jr^lFUr#DkgI0>+V(iCYO(ss`Z0 z-8C<({XK6a+597Bu7VetLZM{U50`kAM4rNlE$Zm9H+x_aWNwjXw7-WH1NIfrEq9|0 zjj*7Up#$vH*fqHyVbtZ}`Cf_DO{cA1N<2PGmBv@yiy6|yj7cP>3Iv#)Qoyv37o}f{ z5akI+hjSbrrJ7eQaE1O9`BK$Buc8_16iC^&mjuzv1K-E$Fy 
zae0T$WYz3OIq!1{9PA@F<&mj$LflulIe}qI%jbJ@DUDLhJxX7RWF5j7w`_m6l-j=6 zRQ>nrtm1~6gkz>Fv*uf4U@uqaBg#mGy@^VcKTOSn2^!D(z|ea2oX(|^PEGqElj6^8 zT*gF4Xc1jAP3nkz4h#G`vop-U%Z!j0Mt6Q_A#|lUm5-Y36 z0{%jh(QmT1!dI}n^rmx;O`yyz+tkNifj9sODiS?!J9A~vH^k7{94D$u-{$5N7f-N% z9$MsiOr40QC4S3mRVNXs!4iQQM+Myf}LQ*9Cq@|^| z;alxfb?9kJ`{YEkOjtCHJO{O?)-M`hjD97BN# zk;D@M=RZe~TL|6={v}Gr8!ktb3Ci~ZQ6Xymg|O@G-H$;^fAL)CH&OEqAEoTuU-;4H zn%?EZ(d+jQz4S2I4j<2TUgwzpiD8Qz2?`TP&j5^w4L$nad`jsd*sy(Yb{%<(brh|~ zM8rG}VW<~xL3MwCoj)D%$`i8oj?3lW7}p~U1zzHcF0v!`&WyF4=|ApAgfFb{Z6#o2 zuW10mY;N?b55~KXY2-0uL^UhxCUN|1zDb{8#R@)P5=k)ztw-mK;7=#o=R)%hzIKgQFv9NpI1&cp3DsroPb`N06I!MXXtrw$r2UqKQs~QkrWm z((ekQ%I(Q&U&F)W<0Z?SlGOV1OeH`4u94YMB5%G3)ItQrfV7-5?D+<|y{*YUqm;(4 zpP@o*^F(pjvrtulf)+WjRh2>F8_yhMmp1pL$2zdYv`D|<=xv*~d(21UHKtLV-0Uvs zYhmb3B<=|+ZZA%aEpwU?D@RmyK%h%sT@*BdFTv}JDua!}dy@q4PhmbhK`kwf@;WyX zN!H}n7rsA~%k3Sk?){`FLou>ZhAE_ItNQz7c|%!Am=CeqPowZmJ9+wRXXZTbOmhrgXCe649WQ;sep5|?g8E@e$w|ptjYfw!$>ds= zIWvnGQe*Xu2hWD62A9KA3PP_gSgsQL^Gjn~-}XG$~P2ocYR0_(1Cb2#9JS z(*q^K84sfhp)0{)GJx)-yd zQ7(hQ9zYkpID%Vh!{M+@y3=rS!eiWREomx$6{u^t z;bw|?=*{@aGZx(-3JYuUvsI?PwAhG)AW39~n-I;9F<=3aPY5&TlBwwb04<`!Zdb?*>gG>!$n41M|H|(dx-h;dO z*CU(H6+>cUYmaw#{42a!NIjV~mr|Df3Vc$G#eOSc2h>Kl+UB#Ei9ovblCor>LN{bIZ1VW@0;!Vgvb|2%{z zMhOCJ9hNETZHq;({E-{`K^hLczL=w9W6|S8WvyX7;f8AUe;#~OSu_OKH8*->HW)=b zMZbivpTjY0en;OxT_-Qr$b4FPDNtcVHE+yB?cYCw%*!`L!{oJ z)`9b{L>11z9|8kPQO*;ooqfZvr&j)Ra1b1}>+rv<#$lEC;NS5@n>{#p&Pk8v=V-|G zKhFnUAajLzaYRx`W_+{w$(v#KIy#!%qyBU42y!3+eZQzrQQMP}PS~ z%Epc`^ooj%YW(W=T~ON}TT_)|N7n(;`#+DBCcKB>Xu8?_<^1g@$4*@M4>iGxn23Oj z6Iy#OL*P=><&A4LwSGZ6oyLv-`#nM;D7_j8S#Q6OMIJKGY0J8}+y_Aa|2+7YsCa79 z*PYr9( z?`1dO5W&x*2KP*vks%Sd)qg)&IaD|htA|Mww_mZ$3v&MN{UbY7|C?UuZ|u-}`LE{< zy8r)rE}*CWujdHR^1q*ZK>GiB@Tgq;*Q1Hrj{kau_WqBM|Leb}puF&3&!6r5ha7*X z^B-FL;hq1`;{P|2^`9gE`L{n5_(Op|6!=4dKNR>wfj<=Ze@TJy-z(#IG*Hs)`rSWE zcjo`)+JDyiLxDdO_(Op|6!=4dKNR>wfj<=ZLxKN$3QQG$ibfEs;gh;rf4tNGd-DDH k#UBd%p}_yADKJN&uDL(qkzWpUIwHtPJyYE>ZEW=a0qVmQf&c&j literal 0 HcmV?d00001 diff --git 
a/thrust/docs/build_docs_locally.bash b/thrust/docs/build_docs_locally.bash new file mode 100755 index 0000000000..d77bc97cbc --- /dev/null +++ b/thrust/docs/build_docs_locally.bash @@ -0,0 +1,13 @@ +#!/usr/bin/env sh + +## This script will produce a 'build_docs' folder that contains a jekyll site containing all the Thrust docs +## This is used in CI to produce a site for Thrust under CCCL + +set -ex + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) + +cd $SCRIPT_PATH +mkdir -p build +cp github_pages/Gemfile build/Gemfile +./generate_markdown.bash diff --git a/thrust/docs/doxybook/config.json b/thrust/docs/doxybook/config.json index 56b7a238be..14c1ced4c6 100644 --- a/thrust/docs/doxybook/config.json +++ b/thrust/docs/doxybook/config.json @@ -1,5 +1,5 @@ { - "baseUrl": "{{ site.baseurl }}/api/", + "baseUrl": "/{{ site.baseurl }}/api/", "copyImages": true, "fileExt": "md", "filesFilter": [], diff --git a/thrust/docs/github_pages/_config.yml b/thrust/docs/github_pages/_config.yml index c131e84fb2..c4a48ffa0a 100644 --- a/thrust/docs/github_pages/_config.yml +++ b/thrust/docs/github_pages/_config.yml @@ -12,6 +12,8 @@ search.heading_level: 4 incremental: true +baseurl: "cccl/thrust" + # just-the-docs ignores these filenames by default. include: [ "contributing.md", "code_of_conduct.md" ] @@ -19,6 +21,7 @@ exclude: [ "node_modules", "doxybook_templates", "generate_markdown.bash", "serve_docs_locally.bash" ] plugins: + - jekyll-remote-theme - jekyll-optional-front-matter # GitHub Pages. - jekyll-default-layout # GitHub Pages. - jekyll-titles-from-headings # GitHub Pages. 
diff --git a/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss b/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss index 4b44fa222e..6a63f85e2a 100644 --- a/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss +++ b/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss @@ -31,6 +31,7 @@ code.doxybook h3 { margin-bottom: 1.0em !important; } $nav-width: 300px; +$content-width: 1000px; $body-background-color: $grey-dk-300; $sidebar-color: $grey-dk-300; From 1570b18ad9c5e69ba4b6dcb6db68c9f4784cd036 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Fri, 1 Dec 2023 05:41:21 -0800 Subject: [PATCH 4/4] Update Docs links in README.md (#1169) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7171af7330..29848976d2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) -|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/thrust/)|[CUB Docs](https://nvlabs.github.io/cub/)| +|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/cccl/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/cccl/thrust/)|[CUB Docs](https://nvidia.github.io/cccl/cub/)| 
|-|-|-|-|-|-|-|-| # CUDA C++ Core Libraries (CCCL)