From 387e1f520d217b3c041d149c1bb73d23256892e5 Mon Sep 17 00:00:00 2001 From: Georgii Evtushenko Date: Thu, 30 Nov 2023 04:07:23 +0400 Subject: [PATCH 1/4] Port device docs to rst (#1160) --- cub/cub/block/block_load.cuh | 2 +- cub/cub/block/block_merge_sort.cuh | 1 - cub/cub/block/block_radix_rank.cuh | 11 +- cub/cub/block/block_radix_sort.cuh | 6 - cub/cub/block/block_reduce.cuh | 2 +- cub/cub/block/block_run_length_decode.cuh | 6 +- cub/cub/block/block_shuffle.cuh | 4 +- cub/cub/device/device_adjacent_difference.cuh | 871 ++-- cub/cub/device/device_copy.cuh | 209 +- cub/cub/device/device_histogram.cuh | 2042 ++++---- cub/cub/device/device_memcpy.cuh | 213 +- cub/cub/device/device_merge_sort.cuh | 2 - cub/cub/device/device_partition.cuh | 843 ++-- cub/cub/device/device_radix_sort.cuh | 21 +- cub/cub/device/device_reduce.cuh | 1527 +++--- cub/cub/device/device_run_length_encode.cuh | 472 +- cub/cub/device/device_scan.cuh | 2815 ++++++------ .../device/device_segmented_radix_sort.cuh | 2177 +++++---- cub/cub/device/device_segmented_reduce.cuh | 1352 +++--- cub/cub/device/device_segmented_sort.cuh | 4094 +++++++++-------- cub/cub/device/device_select.cuh | 1223 +++-- cub/cub/device/device_spmv.cuh | 246 +- .../device/dispatch/dispatch_batch_memcpy.cuh | 6 +- .../device/dispatch/dispatch_histogram.cuh | 4 +- .../device/dispatch/dispatch_radix_sort.cuh | 36 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 30 +- cub/cub/device/dispatch/dispatch_rle.cuh | 18 +- cub/cub/device/dispatch/dispatch_scan.cuh | 8 +- .../dispatch/dispatch_segmented_sort.cuh | 20 +- .../device/dispatch/dispatch_spmv_orig.cuh | 8 +- .../dispatch/dispatch_unique_by_key.cuh | 18 +- cub/cub/grid/grid_barrier.cuh | 9 - cub/cub/grid/grid_even_share.cuh | 9 - cub/cub/grid/grid_mapping.cuh | 10 - cub/cub/grid/grid_queue.cuh | 11 - cub/cub/iterator/arg_index_input_iterator.cuh | 9 - .../cache_modified_input_iterator.cuh | 11 - .../cache_modified_output_iterator.cuh | 9 - 
cub/cub/iterator/constant_input_iterator.cuh | 10 - cub/cub/iterator/counting_input_iterator.cuh | 9 - cub/cub/iterator/discard_output_iterator.cuh | 9 - cub/cub/iterator/tex_obj_input_iterator.cuh | 11 - cub/cub/iterator/tex_ref_input_iterator.cuh | 7 - cub/cub/iterator/transform_input_iterator.cuh | 10 - cub/cub/thread/thread_load.cuh | 8 - cub/cub/thread/thread_operators.cuh | 9 - cub/cub/thread/thread_scan.cuh | 11 +- cub/cub/thread/thread_store.cuh | 10 - cub/cub/util_allocator.cuh | 11 - cub/cub/util_cpp_dialect.cuh | 8 +- cub/cub/util_debug.cuh | 8 - cub/cub/util_device.cuh | 10 - cub/cub/util_macro.cuh | 9 +- cub/cub/util_ptx.cuh | 14 - cub/cub/util_temporary_storage.cuh | 7 - cub/cub/util_type.cuh | 10 - cub/cub/warp/warp_exchange.cuh | 1 - cub/docs/benchmarking.rst | 2 +- cub/docs/index.rst | 1 + cub/docs/repo.toml | 3 +- 60 files changed, 9022 insertions(+), 9511 deletions(-) diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index d95cca4e34..0a4d6d0be0 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -755,7 +755,7 @@ enum BlockLoadAlgorithm //! using CUDA's built-in vectorized loads as a coalescing optimization. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_TRANSPOSE`: //! A :ref:`striped arrangement ` of data is read directly from memory and is then -//! locally transposed into a `blocked arrangement `. +//! locally transposed into a :ref:`blocked arrangement `. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE`: //! A :ref:`warp-striped arrangement ` of data is read directly from memory and is then //! locally transposed into a :ref:`blocked arrangement `. 
diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index f824e6025c..451a079d97 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -682,7 +682,6 @@ private: /** * @brief The BlockMergeSort class provides methods for sorting items * partitioned across a CUDA thread block using a merge sorting method. - * @ingroup BlockModule * * @tparam KeyT * KeyT type diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 7757dea1bc..09f6e14d20 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -276,7 +276,7 @@ private: BlockScan; - /// Shared memory storage layout type for BlockRadixRank + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document struct __align__(16) _TempStorage { union Aliasable @@ -289,6 +289,7 @@ private: // Storage for scanning local ranks typename BlockScan::TempStorage block_scan; }; + #endif // !DOXYGEN_SHOULD_SKIP_THIS /// Shared storage reference _TempStorage &temp_storage; @@ -634,7 +635,7 @@ private: BlockScanT; - /// Shared memory storage layout type for BlockRadixRank + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document struct __align__(16) _TempStorage { typename BlockScanT::TempStorage block_scan; @@ -646,6 +647,7 @@ private: } aliasable; }; + #endif // !DOXYGEN_SHOULD_SKIP_THIS /// Shared storage reference _TempStorage &temp_storage; @@ -657,7 +659,7 @@ private: public: - /// @smemstorage{BlockScan} + /// @smemstorage{BlockRadixRankMatch} struct TempStorage : Uninitialized<_TempStorage> {}; @@ -957,9 +959,6 @@ struct BlockRadixRankMatchEarlyCounts // types typedef cub::BlockScan BlockScan; - - - // temporary storage struct TempStorage { union diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 538a806973..1d0cb52adb 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -182,8 +182,6 @@ CUB_NAMESPACE_BEGIN //! 
This example can be easily adapted to the storage required by BlockRadixSort. //! @endrst //! -//! @ingroup BlockModule -//! //! @tparam KeyT //! KeyT type //! @@ -2231,8 +2229,4 @@ public: }; -/** - * \example example_block_radix_sort.cu - */ - CUB_NAMESPACE_END diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index cf7dc2fd64..dc240382d1 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -105,7 +105,7 @@ enum BlockReduceAlgorithm //! single warp rake across segments of shared partial reductions. //! #. A warp-synchronous Kogge-Stone style reduction within the raking warp. //! - //! @par Performance Considerations + //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - This variant performs more communication than BLOCK_REDUCE_RAKING diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index f181835279..e4544fdc12 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -60,9 +60,9 @@ CUB_NAMESPACE_BEGIN //! the specified window will be returned. //! //! .. note:: -//! Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). -//! A run of length zero may not be followed by a run length that is not zero. -//! +//! +//! Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). +//! A run of length zero may not be followed by a run length that is not zero. //! //! .. code-block:: c++ //! diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index 43ff0b190c..5b4b572543 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -185,7 +185,7 @@ public: //! //! - @smemreuse //! - //! @rst + //! @endrst //! //! @param[in] input //! The calling thread's input item @@ -311,7 +311,7 @@ public: //! @rst //! 
The thread block rotates its :ref:`blocked arrangement ` of input items, - //! shifting it down by one item. All threads receive ``input[0]` provided by *thread*\ :sub:`0`. + //! shifting it down by one item. All threads receive ``input[0]`` provided by *thread*\ :sub:`0`. //! //! - @blocked //! - @granularity diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index cffdb5e153..5bcf9badbc 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -50,63 +50,66 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DeviceAdjacentDifference provides device-wide, parallel operations for - * computing the differences of adjacent elements residing within - * device-accessible memory. - * - * @ingroup SingleModule - * - * @par Overview - * - DeviceAdjacentDifference calculates the differences of adjacent elements in - * d_input. Because the binary operation could be noncommutative, there - * are two sets of methods. Methods named SubtractLeft subtract left element - * `*(i - 1)` of input sequence from current element `*i`. - * Methods named `SubtractRight` subtract current element `*i` from the - * right one `*(i + 1)`: - * @par - * @code - * int *d_values; // [1, 2, 3, 4] - * //... - * int *d_subtract_left_result <-- [ 1, 1, 1, 1 ] - * int *d_subtract_right_result <-- [ -1, -1, -1, 4 ] - * @endcode - * - For SubtractLeft, if the left element is out of bounds, the iterator is - * assigned to \*(result + (i - first)) without modification. - * - For SubtractRight, if the right element is out of bounds, the iterator is - * assigned to \*(result + (i - first)) without modification. - * - * @par Snippet - * The code snippet below illustrates how to use @p DeviceAdjacentDifference to - * compute the left difference between adjacent elements. 
- * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_values; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * //... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, d_values, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, d_values, num_items); - * - * // d_values <-- [1, 1, -1, 1, -1, 1, -1, 1] - * @endcode - */ +//! @rst +//! DeviceAdjacentDifference provides device-wide, parallel operations for +//! computing the differences of adjacent elements residing within +//! device-accessible memory. +//! +//! Overview +//! ++++++++++++++++++++++++++ +//! +//! - DeviceAdjacentDifference calculates the differences of adjacent elements in +//! d_input. Because the binary operation could be noncommutative, there +//! are two sets of methods. Methods named SubtractLeft subtract left element +//! ``*(i - 1)`` of input sequence from current element ``*i``. +//! Methods named ``SubtractRight`` subtract current element ``*i`` from the +//! right one ``*(i + 1)``: +//! +//! .. code-block:: c++ +//! +//! int *d_values; // [1, 2, 3, 4] +//! //... +//! int *d_subtract_left_result <-- [ 1, 1, 1, 1 ] +//! int *d_subtract_right_result <-- [ -1, -1, -1, 4 ] +//! +//! - For SubtractLeft, if the left element is out of bounds, the iterator is +//! assigned to ``*(result + (i - first))`` without modification. +//! - For SubtractRight, if the right element is out of bounds, the iterator is +//! assigned to ``*(result + (i - first))`` without modification. +//! +//! Snippet +//! ++++++++++++++++++++++++++ +//! +//! 
The code snippet below illustrates how to use ``DeviceAdjacentDifference`` to +//! compute the left difference between adjacent elements. +//! +//! .. code-block:: c++ +//! +//! #include +//! // or equivalently +//! +//! // Declare, allocate, and initialize device-accessible pointers +//! int num_items; // e.g., 8 +//! int *d_values; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] +//! //... +//! +//! // Determine temporary device storage requirements +//! void *d_temp_storage = NULL; +//! size_t temp_storage_bytes = 0; +//! +//! cub::DeviceAdjacentDifference::SubtractLeft( +//! d_temp_storage, temp_storage_bytes, d_values, num_items); +//! +//! // Allocate temporary storage +//! cudaMalloc(&d_temp_storage, temp_storage_bytes); +//! +//! // Run operation +//! cub::DeviceAdjacentDifference::SubtractLeft( +//! d_temp_storage, temp_storage_bytes, d_values, num_items); +//! +//! // d_values <-- [1, 1, -1, 1, -1, 1, -1, 1] +//! +//! @endrst struct DeviceAdjacentDifference { private: @@ -145,106 +148,113 @@ private: public: - /** - * @brief Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. - * @ingroup SingleModule - * - * @par Overview - * - Calculates the differences of adjacent elements in `d_input`. That is, - * `*d_input` is assigned to `*d_output`, and, for each iterator `i` in the - * range `[d_input + 1, d_input + num_items)`, the result of - * `difference_op(*i, *(i - 1))` is assigned to - * `*(d_output + (i - d_input))`. - * - Note that the behavior is undefined if the input and output ranges - * overlap in any way. - * - * @par Snippet - * The code snippet below illustrates how to use @p DeviceAdjacentDifference - * to compute the difference between adjacent elements. 
- * - * @par - * @code - * #include - * // or equivalently - * - * struct CustomDifference - * { - * template - * __device__ DataType operator()(DataType &lhs, DataType &rhs) - * { - * return lhs - rhs; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * int *d_output; - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * - * cub::DeviceAdjacentDifference::SubtractLeftCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, - * num_items, CustomDifference()); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractLeftCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, - * num_items, CustomDifference()); - * - * // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] - * // d_output <-- [1, 1, -1, 1, -1, 1, -1, 1] - * @endcode - * - * @tparam InputIteratorT - * is a model of Input Iterator, - * and `x` and `y` are objects of `InputIteratorT`'s `value_type`, then - * `x - y` is defined, and `InputIteratorT`'s `value_type` is convertible to - * a type in `OutputIteratorT`'s set of `value_types`, and the return type - * of `x - y` is convertible to a type in `OutputIteratorT`'s set of - * `value_types`. - * - * @tparam OutputIteratorT - * is a model of Output Iterator. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `OutputIteratorT`'s set of - * `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_input - * Pointer to the input sequence - * - * @param[out] d_output - * Pointer to the output sequence - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0 - */ + //! @rst + //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory + //! + //! Overview + //! ++++++++++++++++++++++++++ + //! + //! - Calculates the differences of adjacent elements in ``d_input``. + //! That is, ``*d_input`` is assigned to ``*d_output``, and, for each iterator ``i`` in the + //! range ``[d_input + 1, d_input + num_items)``, the result of + //! ``difference_op(*i, *(i - 1))`` is assigned to ``*(d_output + (i - d_input))``. + //! - Note that the behavior is undefined if the input and output ranges + //! overlap in any way. + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! struct CustomDifference + //! { + //! template + //! __device__ DataType operator()(DataType &lhs, DataType &rhs) + //! { + //! return lhs - rhs; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! int *d_output; + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! + //! cub::DeviceAdjacentDifference::SubtractLeftCopy( + //! d_temp_storage, temp_storage_bytes, + //! 
d_input, d_output, + //! num_items, CustomDifference()); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! cub::DeviceAdjacentDifference::SubtractLeftCopy( + //! d_temp_storage, temp_storage_bytes, + //! d_input, d_output, + //! num_items, CustomDifference()); + //! + //! // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] + //! // d_output <-- [1, 1, -1, 1, -1, 1, -1, 1] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! @rst + //! is a model of `Input Iterator `_, + //! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then + //! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to + //! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type + //! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of + //! ``value_types``. + //! @endrst + //! + //! @tparam OutputIteratorT + //! @rst + //! is a model of `Output Iterator `_. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `OutputIteratorT`'s set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_input + //! Pointer to the input sequence + //! + //! @param[out] d_output + //! Pointer to the output sequence + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0` + //! 
@endrst template - * // or equivalently - * - * struct CustomDifference - * { - * template - * __device__ DataType operator()(DataType &lhs, DataType &rhs) - * { - * return lhs - rhs; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items, CustomDifference()); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractLeft( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items, CustomDifference()); - * - * // d_data <-- [1, 1, -1, 1, -1, 1, -1, 1] - * @endcode - * - * @tparam RandomAccessIteratorT - * is a model of Random Access Iterator, - * `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of - * `RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the - * return type of `x - y` should be convertible to a type in - * `RandomAccessIteratorT`'s set of `value_types`. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s - * set of `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_input - * Pointer to the input sequence and the result - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. + //! + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Calculates the differences of adjacent elements in ``d_input``. That is, for + //! each iterator ``i`` in the range ``[d_input + 1, d_input + num_items)``, the + //! result of ``difference_op(*i, *(i - 1))`` is assigned to + //! ``*(d_input + (i - d_input))``. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! struct CustomDifference + //! { + //! template + //! __device__ DataType operator()(DataType &lhs, DataType &rhs) + //! { + //! return lhs - rhs; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceAdjacentDifference::SubtractLeft( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items, CustomDifference()); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! 
cub::DeviceAdjacentDifference::SubtractLeft( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items, CustomDifference()); + //! + //! // d_data <-- [1, 1, -1, 1, -1, 1, -1, 1] + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! @rst + //! is a model of `Random Access Iterator `_, + //! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of + //! ``RandomAccessIteratorT``'s ``value_type``, and ``x - y`` is defined, then the + //! return type of ``x - y`` should be convertible to a type in + //! ``RandomAccessIteratorT``'s set of ``value_types``. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s + //! set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of `num_items` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_input + //! Pointer to the input sequence and the result + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -429,106 +445,114 @@ public: stream); } - /** - * @brief Subtracts the right element of each adjacent pair of elements - * residing within device-accessible memory. - * - * @ingroup SingleModule - * - * @par Overview - * - Calculates the right differences of adjacent elements in `d_input`. 
That - * is, `*(d_input + num_items - 1)` is assigned to - * `*(d_output + num_items - 1)`, and, for each iterator `i` in the range - * `[d_input, d_input + num_items - 1)`, the result of - * `difference_op(*i, *(i + 1))` is assigned to - * `*(d_output + (i - d_input))`. - * - Note that the behavior is undefined if the input and output ranges - * overlap in any way. - * - * @par Snippet - * The code snippet below illustrates how to use @p DeviceAdjacentDifference - * to compute the difference between adjacent elements. - * - * @par - * @code - * #include - * // or equivalently - * - * struct CustomDifference - * { - * template - * __device__ DataType operator()(DataType &lhs, DataType &rhs) - * { - * return lhs - rhs; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * int *d_output; - * .. - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceAdjacentDifference::SubtractRightCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, num_items, CustomDifference()); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractRightCopy( - * d_temp_storage, temp_storage_bytes, - * d_input, d_output, num_items, CustomDifference()); - * - * // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] - * // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] - * @endcode - * - * @tparam InputIteratorT - * is a model of Input Iterator, - * and `x` and `y` are objects of `InputIteratorT`'s `value_type`, then - * `x - y` is defined, and `InputIteratorT`'s `value_type` is convertible to - * a type in `OutputIteratorT`'s set of `value_types`, and the return type - * of `x - y` is convertible to a type in `OutputIteratorT`'s set of - * `value_types`. 
- * - * @tparam OutputIteratorT - * is a model of Output Iterator. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s - * set of `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_input - * Pointer to the input sequence - * - * @param[out] d_output - * Pointer to the output sequence - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. + //! + //! Overview + //! ++++++++++++++++++++++++++ + //! + //! - Calculates the right differences of adjacent elements in ``d_input``. + //! That is, ``*(d_input + num_items - 1)`` is assigned to + //! ``*(d_output + num_items - 1)``, and, for each iterator ``i`` in the range + //! ``[d_input, d_input + num_items - 1)``, the result of + //! ``difference_op(*i, *(i + 1))`` is assigned to + //! ``*(d_output + (i - d_input))``. + //! - Note that the behavior is undefined if the input and output ranges + //! overlap in any way. + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! struct CustomDifference + //! { + //! template + //! 
__device__ DataType operator()(DataType &lhs, DataType &rhs) + //! { + //! return lhs - rhs; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! int *d_output; + //! .. + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceAdjacentDifference::SubtractRightCopy( + //! d_temp_storage, temp_storage_bytes, + //! d_input, d_output, num_items, CustomDifference()); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! cub::DeviceAdjacentDifference::SubtractRightCopy( + //! d_temp_storage, temp_storage_bytes, + //! d_input, d_output, num_items, CustomDifference()); + //! + //! // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] + //! // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! @rst + //! is a model of `Input Iterator `_, + //! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then + //! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to + //! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type + //! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of + //! ``value_types``. + //! @endrst + //! + //! @tparam OutputIteratorT + //! @rst + //! is a model of `Output Iterator `_. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s + //! set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! 
@param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_input + //! Pointer to the input sequence + //! + //! @param[out] d_output + //! Pointer to the output sequence + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences. + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * int num_items; // e.g., 8 - * int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceAdjacentDifference::SubtractRight( - * d_temp_storage, temp_storage_bytes, d_data, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run operation - * cub::DeviceAdjacentDifference::SubtractRight( - * d_temp_storage, temp_storage_bytes, d_data, num_items); - * - * // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] - * @endcode - * - * @tparam RandomAccessIteratorT - * is a model of Random Access Iterator, - * `RandomAccessIteratorT` is mutable. If `x` and `y` are objects of - * `RandomAccessIteratorT`'s `value_type`, and `x - y` is defined, then the - * return type of `x - y` should be convertible to a type in - * `RandomAccessIteratorT`'s set of `value_types`. - * - * @tparam DifferenceOpT - * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s - * set of `value_types`. - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_input - * Pointer to the input sequence - * - * @param[in] num_items - * Number of items in the input sequence - * - * @param[in] difference_op - * The binary function used to compute differences - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. + //! + //! Overview + //! ++++++++++++++++++++++++++ + //! + //! Calculates the right differences of adjacent elements in ``d_input``. + //! That is, for each iterator ``i`` in the range + //! ``[d_input, d_input + num_items - 1)``, the result of + //! ``difference_op(*i, *(i + 1))`` is assigned to ``*(d_input + (i - d_input))``. + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` + //! to compute the difference between adjacent elements. + //! + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceAdjacentDifference::SubtractRight( + //! d_temp_storage, temp_storage_bytes, d_data, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run operation + //! cub::DeviceAdjacentDifference::SubtractRight( + //! d_temp_storage, temp_storage_bytes, d_data, num_items); + //! + //! // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! @rst + //! 
is a model of `Random Access Iterator `_, + //! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of + //! ``RandomAccessIteratorT``'s `value_type`, and ``x - y`` is defined, then the + //! return type of ``x - y`` should be convertible to a type in + //! ``RandomAccessIteratorT``'s set of ``value_types``. + //! @endrst + //! + //! @tparam DifferenceOpT + //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s + //! set of `value_types`. + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_input + //! Pointer to the input sequence + //! + //! @param[in] num_items + //! Number of items in the input sequence + //! + //! @param[in] difference_op + //! The binary function used to compute differences + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh index f6c9151f4a..946f03f57f 100644 --- a/cub/cub/device/device_copy.cuh +++ b/cub/cub/device/device_copy.cuh @@ -25,10 +25,7 @@ * ******************************************************************************/ -/** - * \file - * cub::DeviceCopy provides device-wide, parallel operations for copying data. - */ +//! @file cub::DeviceCopy provides device-wide, parallel operations for copying data. #pragma once @@ -50,101 +47,119 @@ CUB_NAMESPACE_BEGIN -/** - * @brief cub::DeviceCopy provides device-wide, parallel operations for copying data. - * \ingroup SingleModule - */ +//! @brief cub::DeviceCopy provides device-wide, parallel operations for copying data. 
struct DeviceCopy { - /** - * @brief Copies data from a batch of given source ranges to their corresponding destination - * ranges. - * @note If any input range aliases any output range the behavior is undefined. If - * any output range aliases another output range the behavior is undefined. Input - * ranges can alias one another. - * - * @par Snippet - * The code snippet below illustrates usage of DeviceCopy::Batched to perform a DeviceRunLength - * Decode operation. - * @par - * @code - * struct GetIteratorToRange - * { - * __host__ __device__ __forceinline__ auto operator()(uint32_t index) - * { - * return thrust::make_constant_iterator(d_data_in[index]); - * } - * int32_t *d_data_in; - * }; - * - * struct GetPtrToRange - * { - * __host__ __device__ __forceinline__ auto operator()(uint32_t index) - * { - * return d_data_out + d_offsets[index]; - * } - * int32_t *d_data_out; - * uint32_t *d_offsets; - * }; - * - * struct GetRunLength - * { - * __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index) - * { - * return d_offsets[index + 1] - d_offsets[index]; - * } - * uint32_t *d_offsets; - * }; - * - * uint32_t num_ranges = 5; - * int32_t *d_data_in; // e.g., [4, 2, 7, 3, 1] - * int32_t *d_data_out; // e.g., [0, ... 
] - * uint32_t *d_offsets; // e.g., [0, 2, 5, 6, 9, 14] - * - * // Returns a constant iterator to the element of the i-th run - * thrust::counting_iterator iota(0); - * auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in}); - * - * // Returns the run length of the i-th run - * auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets}); - * - * // Returns pointers to the output range for each run - * auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets}); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, - * num_ranges); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run batched copy algorithm (used to perform runlength decoding) - * cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, - * num_ranges); - * - * // d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1] - * @endcode - * @tparam InputIt [inferred] Device-accessible random-access input iterator type - * providing the iterators to the source ranges - * @tparam OutputIt [inferred] Device-accessible random-access input iterator type - * providing the iterators to the destination ranges - * @tparam SizeIteratorT [inferred] Device-accessible random-access input iterator - * type providing the number of items to be copied for each pair of ranges - * @param d_temp_storage [in] Device-accessible allocation of temporary storage. When NULL, the - * required allocation size is written to \p temp_storage_bytes and no work is done. 
- * @param temp_storage_bytes [in,out] Reference to size in bytes of \p d_temp_storage allocation - * @param input_it [in] Device-accessible iterator providing the iterators to the source - * ranges - * @param output_it [in] Device-accessible iterator providing the iterators to the - * destination ranges - * @param sizes [in] Device-accessible iterator providing the number of elements to be copied - * for each pair of ranges - * @param num_ranges [in] The total number of range pairs - * @param stream [in] [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Copies data from a batch of given source ranges to their corresponding destination ranges. + //! + //! .. note:: + //! + //! If any input range aliases any output range the behavior is undefined. + //! If any output range aliases another output range the behavior is undefined. + //! Input ranges can alias one another. + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates usage of DeviceCopy::Batched to perform a DeviceRunLength Decode operation. + //! + //! .. code-block:: c++ + //! + //! struct GetIteratorToRange + //! { + //! __host__ __device__ __forceinline__ auto operator()(uint32_t index) + //! { + //! return thrust::make_constant_iterator(d_data_in[index]); + //! } + //! int32_t *d_data_in; + //! }; + //! + //! struct GetPtrToRange + //! { + //! __host__ __device__ __forceinline__ auto operator()(uint32_t index) + //! { + //! return d_data_out + d_offsets[index]; + //! } + //! int32_t *d_data_out; + //! uint32_t *d_offsets; + //! }; + //! + //! struct GetRunLength + //! { + //! __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index) + //! { + //! return d_offsets[index + 1] - d_offsets[index]; + //! } + //! uint32_t *d_offsets; + //! }; + //! + //! uint32_t num_ranges = 5; + //! int32_t *d_data_in; // e.g., [4, 2, 7, 3, 1] + //! int32_t *d_data_out; // e.g., [0, ... ] + //! 
uint32_t *d_offsets; // e.g., [0, 2, 5, 6, 9, 14] + //! + //! // Returns a constant iterator to the element of the i-th run + //! thrust::counting_iterator iota(0); + //! auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in}); + //! + //! // Returns the run length of the i-th run + //! auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets}); + //! + //! // Returns pointers to the output range for each run + //! auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets}); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, + //! num_ranges); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run batched copy algorithm (used to perform runlength decoding) + //! cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, + //! num_ranges); + //! + //! // d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1] + //! + //! @endrst + //! + //! @tparam InputIt + //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges + //! + //! @tparam OutputIt + //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to + //! the destination ranges + //! + //! @tparam SizeIteratorT + //! **[inferred]** Device-accessible random-access input iterator type providing the number of items to be + //! copied for each pair of ranges + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! 
Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] input_it + //! Device-accessible iterator providing the iterators to the source ranges + //! + //! @param[in] output_it + //! Device-accessible iterator providing the iterators to the destination ranges + //! + //! @param[in] sizes + //! Device-accessible iterator providing the number of elements to be copied for each pair of ranges + //! + //! @param[in] num_ranges + //! The total number of range pairs + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Batched(void *d_temp_storage, size_t &temp_storage_bytes, diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 8047a1a463..a6a3e0edd2 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -26,11 +26,9 @@ * ******************************************************************************/ -/** - * @file cub::DeviceHistogram provides device-wide parallel operations for - * constructing histogram(s) from a sequence of samples data residing - * within device-accessible memory. - */ +//! @file cub::DeviceHistogram provides device-wide parallel operations for +//! constructing histogram(s) from a sequence of samples data residing +//! within device-accessible memory. #pragma once @@ -56,134 +54,131 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DeviceHistogram provides device-wide parallel operations for - * constructing histogram(s) from a sequence of samples data residing - * within device-accessible memory. ![](histogram_logo.png) - * @ingroup SingleModule - * - * @par Overview - * A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). - * - * @par Usage Considerations - * @cdp_class{DeviceHistogram} - * - */ +//! @rst +//! 
DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of +//! samples data residing within device-accessible memory. +//! +//! Overview +//! ++++++++++++++++++++++++++ +//! +//! A `histogram `_ counts the number of observations that fall into each +//! of the disjoint categories (known as *bins*). +//! +//! Usage Considerations +//! ++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceHistogram} +//! +//! @endrst struct DeviceHistogram { - /******************************************************************//** - * @name Evenly-segmented bin ranges - *********************************************************************/ - //@{ + //! @name Evenly-segmented bin ranges + //! @{ - /** - * @brief Computes an intensity histogram from a sequence of data samples - * using equal-width bins. - * - * @par - * - The number of histogram bins is (`num_levels - 1`) - * - All bins comprise the same width of sample values: - * `(upper_level - lower_level) / (num_levels - 1)`. - * - If the common type of `SampleT` and `LevelT` is of integral type, the bin for a sample is - * computed as `(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)`, round - * down to the nearest whole number. To protect against potential overflows, if the product - * `(upper_level - lower_level) * (num_levels - 1)` exceeds the number representable by an - * `uint64_t`, the cuda error `cudaErrorInvalidValue` is returned. If the common type is 128 - * bits wide, bin computation will use 128-bit arithmetic and `cudaErrorInvalidValue` will only - * be returned if bin computation would overflow for 128-bit arithmetic. - * - The ranges `[d_samples, d_samples + num_samples)` and - * `[d_histogram, d_histogram + num_levels - 1)` shall not overlap - * in any way. - * - `cuda::std::common_type` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. The common type must be - * convertible to `int` and trivially copyable. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a sequence of float samples - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histogram - * int num_samples; // e.g., 10 - * float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_samples); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_samples); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading input - * samples \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length - * `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin. - * - * @param[in] num_samples - * The number of input samples (i.e., the length of `d_samples`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using equal-width bins. + //! + //! - The number of histogram bins is (``num_levels - 1``) + //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)``. + //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round + //! down to the nearest whole number. To protect against potential overflows, if the product + //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an + //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 + //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only + //! be returned if bin computation would overflow for 128-bit arithmetic. + //! 
- The ranges ``[d_samples, d_samples + num_samples)`` and + //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT and SampleT must be valid + //! arithmetic types. The common type must be convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a sequence of float samples + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histogram + //! int num_samples; // e.g., 10 + //! float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5] + //! int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels; // e.g., 7 (seven level boundaries for six bins) + //! float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + //! float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_samples); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_samples); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! 
**[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length + //! `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin. + //! + //! @param[in] num_samples + //! The number of input samples (i.e., the length of `d_samples`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template ` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. The common type must be - * convertible to `int` and trivially copyable. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. 
- * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * size_t row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.1, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven( - * d_temp_storage, temp_storage_bytes, d_samples, d_histogram, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of - * length `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin. - * - * @param[in] num_row_samples - * The number of data samples per row in the region of interest - * - * @param[in] num_rows - * The number of rows in the region of interest - * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in - * the region of interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using equal-width bins. + //! + //! - A two-dimensional *region of interest* within ``d_samples`` can be specified using + //! the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins is (``num_levels - 1``) + //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)`` + //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is + //! 
computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round + //! down to the nearest whole number. To protect against potential overflows, if the product + //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an + //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 + //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only + //! be returned if bin computation would overflow for 128-bit arithmetic. + //! - For a given row ``r`` in ``[0, num_rows)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and + //! ``row_end = row_begin + num_row_samples``. The ranges + //! ``[row_begin, row_end)`` and ``[d_histogram, d_histogram + num_levels - 1)`` + //! shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. The common type must be + //! convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a 2x5 region of interest within a flattened 2x7 array of float samples. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histogram + //! int num_row_samples; // e.g., 5 + //! int num_rows; // e.g., 2; + //! size_t row_stride_bytes; // e.g., 7 * sizeof(float) + //! float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, -, -, + //! // 0.3, 2.9, 2.1, 6.1, 999.5, -, -] + //! int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels; // e.g., 7 (seven level boundaries for six bins) + //! float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + //! 
float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramEven( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of + //! length `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! 
Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin. + //! + //! @param[in] num_row_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in + //! the region of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram bins - * have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - If the common type of sample and level is of integral type, the bin for a sample is - * computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - - * lower_level[i])`, round down to the nearest whole number. To protect against potential - * overflows, if, for any channel `i`, the product `(upper_level[i] - lower_level[i]) * - * (num_levels[i] - 1)` exceeds the number representable by an `uint64_t`, the cuda error - * `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation - * will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin - * computation would overflow for 128-bit arithmetic. - * - For a given channel `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges - * `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` and - * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap - * in any way. - * - `cuda::std::common_type` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. 
The common type must be - * convertible to `int` and trivially copyable. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 256-bin RGB histograms - * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histograms - * int num_pixels; // e.g., 5 - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), - * // (0, 6, 7, 5), (3, 0, 2, 6)] - * int* d_histogram[3]; // e.g., three device pointers to three device buffers, - * // each allocated with 256 integer counters - * int num_levels[3]; // e.g., {257, 257, 257}; - * unsigned int lower_level[3]; // e.g., {0, 0, 0}; - * unsigned int upper_level[3]; // e.g., {256, 256, 256}; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * lower_level, upper_level, num_pixels); - * - * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** 
Random-access input iterator type for reading - * input samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. - * The samples from different channels are assumed to be interleaved - * (e.g., an array of 32-bit pixels where each pixel consists of four - * *RGBA* 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of - * `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin in - * each active channel. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin - * in each active channel. - * - * @param[in] num_pixels - * The number of multi-channel pixels - * (i.e., the length of `d_samples / NUM_CHANNELS`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using + //! 
equal-width bins. + //! + //! - The input is a sequence of *pixel* structures, where each pixel comprises + //! a record of ``NUM_CHANNELS`` consecutive data samples + //! (e.g., an *RGBA* pixel). + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` + //! (e.g., only *RGB* histograms from *RGBA* pixel samples). + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - If the common type of sample and level is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, round down + //! to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, the product + //! ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by an ``uint64_t``, + //! the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 bits wide, bin computation + //! will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only be returned if bin + //! computation would overflow for 128-bit arithmetic. + //! - For a given channel ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges + //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` and + //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. + //! The common type must be convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 256-bin *RGB* histograms + //! 
from a quad-channel sequence of *RGBA* pixels (8 bits per channel per pixel) + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histograms + //! int num_pixels; // e.g., 5 + //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + //! // (0, 6, 7, 5), (3, 0, 2, 6)] + //! int* d_histogram[3]; // e.g., three device pointers to three device buffers, + //! // each allocated with 256 integer counters + //! int num_levels[3]; // e.g., {257, 257, 257}; + //! unsigned int lower_level[3]; // e.g., {0, 0, 0}; + //! unsigned int upper_level[3]; // e.g., {256, 256, 256}; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_pixels); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! lower_level, upper_level, num_pixels); + //! + //! // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + //! // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + //! // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! 
@tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! The samples from different channels are assumed to be interleaved + //! (e.g., an array of 32-bit pixels where each pixel consists of four + //! *RGBA* 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! The pointers to the histogram counter output arrays, one for each active + //! channel. For channel\ :sub:`i`, the allocation length of + //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in each active channel. + //! Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + //! + //! @param[in] num_pixels + //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. 
- * - For channeli, the range of values for all histogram - * bins have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - If the common type of sample and level is of integral type, the bin for a sample is - * computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - - * lower_level[i])`, round down to the nearest whole number. To protect against potential - * overflows, if, for any channel `i`, the product `(upper_level[i] - lower_level[i]) * - * (num_levels[i] - 1)` exceeds the number representable by an `uint64_t`, the cuda error - * `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation - * will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin - * computation would overflow for 128-bit arithmetic. - * - For a given row `r` in `[0, num_rows)`, and sample `s` in - * `[0, num_row_pixels)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, - * `sample_begin = row_begin + s * NUM_CHANNELS`, and - * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For a given channel - * `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges - * `[sample_begin, sample_end)` and - * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap - * in any way. - * - `cuda::std::common_type` must be valid, and both LevelT - * and SampleT must be valid arithmetic types. The common type must be - * convertible to `int` and trivially copyable. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 256-bin - * *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 - * array of quad-channel *RGBA* pixels (8 bits per channel per pixel). 
- * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input - * // samples and output histograms - * int num_row_pixels; // e.g., 3 - * int num_rows; // e.g., 2 - * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), - * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] - * int* d_histogram[3]; // e.g., three device pointers to three device buffers, - * // each allocated with 256 integer counters - * int num_levels[3]; // e.g., {257, 257, 257}; - * unsigned int lower_level[3]; // e.g., {0, 0, 0}; - * unsigned int upper_level[3]; // e.g., {256, 256, 256}; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading input - * samples. 
\iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. The - * samples from different channels are assumed to be interleaved (e.g., - * an array of 32-bit pixels where each pixel consists of four - * *RGBA* 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each - * active channel. For channeli, the allocation length - * of `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin in - * each active channel. - * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin - * in each active channel. - * - * @param[in] num_row_pixels - * The number of multi-channel pixels per row in the region of interest - * - * @param[in] num_rows - * The number of rows in the region of interest - * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in the region of - * interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! 
@rst + //! Computes per-channel intensity histograms from a sequence of + //! multi-channel "pixel" data samples using equal-width bins. + //! + //! - The input is a sequence of *pixel* structures, where each pixel + //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., only *RGB* + //! histograms from *RGBA* pixel samples). + //! - A two-dimensional *region of interest* within ``d_samples`` can be + //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - If the common type of sample and level is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, + //! round down to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, + //! the product ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by + //! an ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. + //! If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` + //! will only be returned if bin computation would overflow for 128-bit arithmetic. + //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in + //! ``[0, num_row_pixels)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``, + //! 
``sample_begin = row_begin + s * NUM_CHANNELS``, and + //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For a given channel + //! ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges + //! ``[sample_begin, sample_end)`` and + //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. The common type must be + //! convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 256-bin + //! *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 + //! array of quad-channel *RGBA* pixels (8 bits per channel per pixel). + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input + //! // samples and output histograms + //! int num_row_pixels; // e.g., 3 + //! int num_rows; // e.g., 2 + //! size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + //! // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + //! int* d_histogram[3]; // e.g., three device pointers to three device buffers, + //! // each allocated with 256 integer counters + //! int num_levels[3]; // e.g., {257, 257, 257}; + //! unsigned int lower_level[3]; // e.g., {0, 0, 0}; + //! unsigned int upper_level[3]; // e.g., {256, 256, 256}; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_pixels, num_rows, row_stride_bytes); + //! + //! 
// Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, lower_level, upper_level, + //! num_row_pixels, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + //! // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + //! // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input + //! samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. The + //! samples from different channels are assumed to be interleaved (e.g., + //! an array of 32-bit pixels where each pixel consists of four + //! *RGBA* 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! The pointers to the histogram counter output arrays, one for each + //! 
active channel. For channel\ :sub:`i`, the allocation length + //! of ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in each active channel. + //! Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + //! + //! @param[in] num_row_pixels + //! The number of multi-channel pixels per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region of + //! interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is `[level[i], level[i+1])` - * - The range `[d_histogram, d_histogram + num_levels - 1)` shall not - * overlap `[d_samples, d_samples + num_samples)` nor - * `[d_levels, d_levels + num_levels)` in any way. The ranges - * `[d_levels, d_levels + num_levels)` and - * `[d_samples, d_samples + num_samples)` may overlap. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of an six-bin histogram - * from a sequence of float samples - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input - * // samples and output histogram - * int num_samples; // e.g., 10 - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels // e.g., 7 (seven level boundaries for six bins) - * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_samples); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_samples); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples.\iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length - * `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] d_levels - * The pointer to the array of boundaries (levels). Bin ranges are defined - * by consecutive boundary pairings: lower sample value boundaries are - * inclusive and upper sample value boundaries are exclusive. - * - * @param[in] num_samples - * The number of data samples per row in the region of interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + //! + //! - The number of histogram bins is (``num_levels - 1``) + //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` + //! - The range ``[d_histogram, d_histogram + num_levels - 1)`` shall not + //! overlap ``[d_samples, d_samples + num_samples)`` nor + //! ``[d_levels, d_levels + num_levels)`` in any way. The ranges + //! ``[d_levels, d_levels + num_levels)`` and + //! ``[d_samples, d_samples + num_samples)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a sequence of float samples + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input + //! // samples and output histogram + //! int num_samples; // e.g., 10 + //! float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + //! 
int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels // e.g., 7 (seven level boundaries for six bins) + //! float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_samples); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_samples); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length + //! `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! 
Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] d_levels + //! The pointer to the array of boundaries (levels). Bin ranges are defined + //! by consecutive boundary pairings: lower sample value boundaries are + //! inclusive and upper sample value boundaries are exclusive. + //! + //! @param[in] num_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is `[level[i], level[i+1])` - * - For a given row `r` in `[0, num_rows)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and - * `row_end = row_begin + num_row_samples`. The range - * `[d_histogram, d_histogram + num_levels - 1)` shall not overlap - * `[row_begin, row_end)` nor `[d_levels, d_levels + num_levels)`. - * The ranges `[d_levels, d_levels + num_levels)` and `[row_begin, row_end)` - * may overlap. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * int row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ -, -, -, -, -, -] - * int num_levels // e.g., 7 (seven level boundaries for six bins) - * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramRange( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 5, 0, 3, 0, 0]; - * @endcode - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the input sequence of data samples. - * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length - * `num_levels - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. - * Implies that the number of bins is `num_levels - 1`. - * - * @param[in] d_levels - * The pointer to the array of boundaries (levels). 
Bin ranges are defined - * by consecutive boundary pairings: lower sample value boundaries are - * inclusive and upper sample value boundaries are exclusive. - * - * @param[in] num_row_samples - * The number of data samples per row in the region of interest - * - * @param[in] num_rows - * The number of rows in the region of interest - * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in the region - * of interest - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + //! + //! - A two-dimensional *region of interest* within ``d_samples`` can be + //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins is (``num_levels - 1``) + //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` + //! - For a given row ``r`` in ``[0, num_rows)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and + //! ``row_end = row_begin + num_row_samples``. The range + //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap + //! ``[row_begin, row_end)`` nor ``[d_levels, d_levels + num_levels)``. + //! The ranges ``[d_levels, d_levels + num_levels)`` and ``[row_begin, row_end)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of a six-bin histogram + //! from a 2x5 region of interest within a flattened 2x7 array of float samples. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input samples and + //! 
// output histogram + //! int num_row_samples; // e.g., 5 + //! int num_rows; // e.g., 2; + //! int row_stride_bytes; // e.g., 7 * sizeof(float) + //! float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + //! // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + //! int* d_histogram; // e.g., [ -, -, -, -, -, -] + //! int num_levels // e.g., 7 (seven level boundaries for six bins) + //! float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::HistogramRange( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, + //! num_row_samples, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! 
The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length + //! `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] d_levels + //! The pointer to the array of boundaries (levels). Bin ranges are defined + //! by consecutive boundary pairings: lower sample value boundaries are + //! inclusive and upper sample value boundaries are exclusive. + //! + //! @param[in] num_row_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region + //! of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram - * bins have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the - * range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall - * not overlap `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` nor - * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. - * The ranges `[d_levels[c2], d_levels[c2] + num_levels[c2])` and - * `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` may overlap. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 4-bin *RGB* - * histograms from a quad-channel sequence of *RGBA* pixels - * (8 bits per channel per pixel) - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input samples and output histograms - * int num_pixels; // e.g., 5 - * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), - * // (0, 6, 7, 5),(3, 0, 2, 6)] - * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // d_histogram <-- [ [1, 3, 0, 1], - * // [3, 0, 0, 2], - * // [0, 2, 0, 3] ] - * - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading - * input samples. 
\iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. - * The samples from different channels are assumed to be interleaved (e.g., - * an array of 32-bit pixels where each pixel consists of four *RGBA* - * 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of - * `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] d_levels - * The pointers to the arrays of boundaries (levels), one for each active - * channel. Bin ranges are defined by consecutive boundary pairings: lower - * sample value boundaries are inclusive and upper sample value boundaries - * are exclusive. - * - * @param[in] num_pixels - * The number of multi-channel pixels - * (i.e., the length of `d_samples / NUM_CHANNELS`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples + //! using the specified bin boundary levels. + //! + //! 
- The input is a sequence of *pixel* structures, where each pixel + //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples). + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - For given channels ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the + //! range ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall + //! not overlap ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` nor + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. + //! The ranges ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and + //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 4-bin *RGB* + //! histograms from a quad-channel sequence of *RGBA* pixels + //! (8 bits per channel per pixel) + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input samples and output histograms + //! int num_pixels; // e.g., 5 + //! unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + //! // (0, 6, 7, 5),(3, 0, 2, 6)] + //! unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + //! int num_levels[3]; // e.g., {5, 5, 5}; + //! unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8] ]; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! 
size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_pixels); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, num_pixels); + //! + //! // d_histogram <-- [ [1, 3, 0, 1], + //! // [3, 0, 0, 2], + //! // [0, 2, 0, 3] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading + //! input samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! The samples from different channels are assumed to be interleaved (e.g., + //! an array of 32-bit pixels where each pixel consists of four *RGBA* + //! 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! 
The pointers to the histogram counter output arrays, one for each active + //! channel. For channel\ :sub:`i`, the allocation length of + //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in + //! each active channel. Implies that the number of bins for + //! channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] d_levels + //! The pointers to the arrays of boundaries (levels), one for each active + //! channel. Bin ranges are defined by consecutive boundary pairings: lower + //! sample value boundaries are inclusive and upper sample value boundaries + //! are exclusive. + //! + //! @param[in] num_pixels + //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template i is - * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram - * bins have the same width: - * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - For a given row `r` in `[0, num_rows)`, and sample `s` in - * `[0, num_row_pixels)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, - * `sample_begin = row_begin + s * NUM_CHANNELS`, and - * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For given channels - * `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the range - * `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not - * overlap `[sample_begin, sample_end)` nor - * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. The ranges - * `[d_levels[c2], d_levels[c2] + num_levels[c2])` and - * `[sample_begin, sample_end)` may overlap. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the computation of three 4-bin *RGB* - * histograms from a 2x3 region of interest of within a flattened 2x4 array - * of quad-channel *RGBA* pixels (8 bits per channel per pixel). - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input - * // samples and output histograms - * int num_row_pixels; // e.g., 3 - * int num_rows; // e.g., 2 - * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), - * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] - * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>( - * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, - * d_levels, num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [2, 3, 0, 1], - * // [3, 0, 0, 2], - * // [1, 2, 0, 3] ] - * - * @endcode - * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than - * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS - * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - 
* **[inferred]** Random-access input iterator type for reading input - * samples. \iterator - * - * @tparam CounterT - * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT - * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, - * pointer differences, etc. \offset_size1 - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to \p temp_storage_bytes and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. The - * samples from different channels are assumed to be interleaved (e.g., an - * array of 32-bit pixels where each pixel consists of four - * *RGBA* 8-bit samples). - * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of - * `d_histogram[i]` should be `num_levels[i] - 1`. - * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for - * channeli is `num_levels[i] - 1`. - * - * @param[in] d_levels - * The pointers to the arrays of boundaries (levels), one for each active - * channel. Bin ranges are defined by consecutive boundary pairings: lower - * sample value boundaries are inclusive and upper sample value boundaries - * are exclusive. 
- *
- * @param[in] num_row_pixels
- * The number of multi-channel pixels per row in the region of interest
- *
- * @param[in] num_rows
- * The number of rows in the region of interest
- *
- * @param[in] row_stride_bytes
- * The number of bytes between starts of consecutive rows in the
- * region of interest
- *
- * @param[in] stream
- * **[optional]** CUDA stream to launch kernels within.
- * Default is stream0.
- */
+ //! @rst
+ //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using
+ //! the specified bin boundary levels.
+ //!
+ //! - The input is a sequence of *pixel* structures, where each pixel comprises
+ //! a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel).
+ //! - Of the ``NUM_CHANNELS`` specified, the function will only compute
+ //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples).
+ //! - A two-dimensional *region of interest* within ``d_samples`` can be
+ //! specified using the ``num_row_pixels``, ``num_rows``, and ``row_stride_bytes`` parameters.
+ //! - The row stride must be a whole multiple of the sample data type
+ //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``.
+ //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
+ //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width:
+ //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)``
+ //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in ``[0, num_row_pixels)``, let
+ //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``,
+ //! ``sample_begin = row_begin + s * NUM_CHANNELS``, and
+ //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For given channels
+ //! ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the range
+ //! ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall not overlap
+ //!
``[sample_begin, sample_end)`` nor + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. The ranges + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and + //! ``[sample_begin, sample_end)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates the computation of three 4-bin *RGB* + //! histograms from a 2x3 region of interest of within a flattened 2x4 array + //! of quad-channel *RGBA* pixels (8 bits per channel per pixel). + //! + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input + //! // samples and output histograms + //! int num_row_pixels; // e.g., 3 + //! int num_rows; // e.g., 2 + //! size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + //! // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + //! int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + //! int num_levels[3]; // e.g., {5, 5, 5}; + //! unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8], + //! // [0, 2, 4, 6, 8] ]; + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, d_levels, + //! num_row_pixels, num_rows, row_stride_bytes); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Compute histograms + //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( + //! d_temp_storage, temp_storage_bytes, + //! d_samples, d_histogram, num_levels, + //! d_levels, num_row_pixels, num_rows, row_stride_bytes); + //! + //! // d_histogram <-- [ [2, 3, 0, 1], + //! 
// [3, 0, 0, 2], + //! // [1, 2, 0, 3] ] + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than + //! the number of channels being actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input + //! samples. @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, + //! pointer differences, etc. @offset_size1 + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. The + //! samples from different channels are assumed to be interleaved (e.g., an + //! array of 32-bit pixels where each pixel consists of four + //! *RGBA* 8-bit samples). + //! + //! @param[out] d_histogram + //! @rst + //! The pointers to the histogram counter output arrays, one for each active + //! channel. For channel\ :sub:`i`, the allocation length of + //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] num_levels + //! @rst + //! The number of boundaries (levels) for delineating histogram samples in + //! each active channel. Implies that the number of bins for + //! channel\ :sub:`i` is ``num_levels[i] - 1``. + //! @endrst + //! + //! @param[in] d_levels + //! 
The pointers to the arrays of boundaries (levels), one for each active + //! channel. Bin ranges are defined by consecutive boundary pairings: lower + //! sample value boundaries are inclusive and upper sample value boundaries + //! are exclusive. + //! + //! @param[in] num_row_pixels + //! The number of multi-channel pixels per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the + //! region of interest + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template [inferred] Device-accessible random-access input iterator type - * providing the pointers to the source memory buffers - * @tparam OutputBufferIt [inferred] Device-accessible random-access input iterator type - * providing the pointers to the destination memory buffers - * @tparam BufferSizeIteratorT [inferred] Device-accessible random-access input iterator - * type providing the number of bytes to be copied for each pair of buffers - * @param d_temp_storage [in] Device-accessible allocation of temporary storage. When NULL, the - * required allocation size is written to \p temp_storage_bytes and no work is done. - * @param temp_storage_bytes [in,out] Reference to size in bytes of \p d_temp_storage allocation - * @param input_buffer_it [in] Device-accessible iterator providing the pointers to the source - * memory buffers - * @param output_buffer_it [in] Device-accessible iterator providing the pointers to the - * destination memory buffers - * @param buffer_sizes [in] Device-accessible iterator providing the number of bytes to be copied - * for each pair of buffers - * @param num_buffers [in] The total number of buffer pairs - * @param stream [in] [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! 
Copies data from a batch of given source buffers to their corresponding destination buffer.
+ //!
+ //! .. note::
+ //!
+ //! If any input buffer aliases memory from any output buffer the behavior is undefined.
+ //! If any output buffer aliases memory of another output buffer the behavior is undefined.
+ //! Input buffers can alias one another.
+ //!
+ //! Snippet
+ //! +++++++
+ //!
+ //! The code snippet below illustrates usage of DeviceMemcpy::Batched for mutating strings within
+ //! a single string buffer.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! struct GetPtrToStringItem
+ //! {
+ //! __host__ __device__ __forceinline__ void *operator()(uint32_t index)
+ //! {
+ //! return &d_string_data_in[d_string_offsets[index]];
+ //! }
+ //! char *d_string_data_in;
+ //! uint32_t *d_string_offsets;
+ //! };
+ //!
+ //! struct GetStringItemSize
+ //! {
+ //! __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
+ //! {
+ //! return d_string_offsets[index + 1] - d_string_offsets[index];
+ //! }
+ //! uint32_t *d_string_offsets;
+ //! };
+ //!
+ //! uint32_t num_strings = 5;
+ //! char *d_string_data_in; // e.g., "TomatoesBananasApplesOrangesGrapes"
+ //! char *d_string_data_out; // e.g., " ... "
+ //! uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34]
+ //! uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34]
+ //! uint32_t *d_gather_index; // e.g., [2, 1, 4, 3, 0]
+ //!
+ //! // Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced
+ //! auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0),
+ //! d_gather_index);
+ //!
+ //! // Returns pointers to the input buffer for each string
+ //! auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator,
+ //! GetPtrToStringItem{d_string_data_in,
+ //! d_string_offsets_old});
+ //!
+ //! // Returns the string size of the i-th string
+ //!
auto str_sizes = thrust::make_transform_iterator(gather_iterator, + //! GetStringItemSize{d_string_offsets_old}); + //! + //! // Returns pointers to the output buffer for each string + //! auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + //! GetPtrToStringItem{d_string_data_out, + //! d_string_offsets_new}); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out, + //! str_sizes, num_strings); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run batched copy algorithm (used to permute strings) + //! cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out, + //! str_sizes, num_strings); + //! + //! // d_string_data_out <-- "ApplesBananasGrapesOrangesTomatoes" + //! + //! @endrst + //! + //! @tparam InputBufferIt + //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to + //! the source memory buffers + //! + //! @tparam OutputBufferIt + //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to + //! the destination memory buffers + //! + //! @tparam BufferSizeIteratorT + //! **[inferred]** Device-accessible random-access input iterator type providing the number of bytes + //! to be copied for each pair of buffers + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] input_buffer_it + //! Device-accessible iterator providing the pointers to the source memory buffers + //! + //! 
@param[in] output_buffer_it + //! Device-accessible iterator providing the pointers to the destination memory buffers + //! + //! @param[in] buffer_sizes + //! Device-accessible iterator providing the number of bytes to be copied for each pair of buffers + //! + //! @param[in] num_buffers + //! The total number of buffer pairs + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Batched(void *d_temp_storage, size_t &temp_storage_bytes, diff --git a/cub/cub/device/device_merge_sort.cuh b/cub/cub/device/device_merge_sort.cuh index 84ef2b3d2f..e68dcaa46e 100644 --- a/cub/cub/device/device_merge_sort.cuh +++ b/cub/cub/device/device_merge_sort.cuh @@ -49,8 +49,6 @@ CUB_NAMESPACE_BEGIN * computing a merge sort across a sequence of data items residing within * device-accessible memory. * - * @ingroup SingleModule - * * @par Overview * - DeviceMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types (as diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index ee3721f70f..eb3c978a2e 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -26,11 +26,8 @@ * ******************************************************************************/ -/** - * @file - * cub::DevicePartition provides device-wide, parallel operations for - * partitioning sequences of data items residing within device-accessible memory. - */ +//! @file cub::DevicePartition provides device-wide, parallel operations for +//! partitioning sequences of data items residing within device-accessible memory. #pragma once @@ -54,132 +51,122 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DevicePartition provides device-wide, parallel operations for - * partitioning sequences of data items residing within device-accessible - * memory. 
![](partition_logo.png) - * @ingroup SingleModule - * - * @par Overview - * These operations apply a selection criterion to construct a partitioned - * output sequence from items selected/unselected from a specified input - * sequence. - * - * @par Usage Considerations - * \cdp_class{DevicePartition} - * - * @par Performance - * \linear_performance{partition} - * - * @par - * The following chart illustrates DevicePartition::If - * performance across different CUDA architectures for @p int32 items, - * where 50% of the items are randomly selected for the first partition. - * \plots_below - * - * @image html partition_if_int32_50_percent.png - * - */ +//! @rst +//! DevicePartition provides device-wide, parallel operations for +//! partitioning sequences of data items residing within device-accessible memory. +//! +//! Overview +//! ++++++++++++++++++++++++++ +//! +//! These operations apply a selection criterion to construct a partitioned +//! output sequence from items selected/unselected from a specified input +//! sequence. +//! +//! Usage Considerations +//! ++++++++++++++++++++++++++ +//! +//! @cdp_class{DevicePartition} +//! +//! Performance +//! ++++++++++++++++++++++++++ +//! +//! @linear_performance{partition} +//! +//! @endrst struct DevicePartition { - /** - * @brief Uses the @p d_flags sequence to split the corresponding items from - * @p d_in into a partitioned sequence @p d_out. The total number of - * items copied into the first partition is written to - * @p d_num_selected_out. ![](partition_flags_logo.png) - * - * @par - * - The value type of @p d_flags must be castable to @p bool (e.g., - * @p bool, @p char, @p int, etc.). - * - Copies of the selected items are compacted into @p d_out and maintain - * their original relative ordering, however copies of the unselected - * items are compacted into the rear of @p d_out in reverse order. 
- * - The range `[d_out, d_out + num_items)` shall not overlap - * `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any - * way. The range `[d_in, d_in + num_items)` may overlap - * `[d_flags, d_flags + num_items)`. - * - \devicestorage - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an @p int device vector. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * std::size_t temp_storage_bytes = 0; - * cub::DevicePartition::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] - * // d_num_selected_out <-- [4] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading - * input items \iterator - * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading - * selection flags \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing - * output items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number - * of items selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[in] d_flags - * Pointer to the input sequence of selection flags - * - * @param[out] d_out - * Pointer to the output sequence of partitioned data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected (i.e., the - * offset of the unselected partition) - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``d_flags`` sequence to split the corresponding items from + //! ``d_in`` into a partitioned sequence ``d_out``. + //! The total number of items copied into the first partition is written to ``d_num_selected_out``. + //! + //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering, however copies of the unselected + //! items are compacted into the rear of ``d_out`` in reverse order. + //! - The range ``[d_out, d_out + num_items)`` shall not overlap + //! ``[d_in, d_in + num_items)`` nor ``[d_flags, d_flags + num_items)`` in any way. + //! The range ``[d_in, d_in + num_items)`` may overlap ``[d_flags, d_flags + num_items)``. + //! - @devicestorage + //! + //! Snippet + //! ++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input, flags, and output + //! 
int num_items; // e.g., 8 + //! int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! std::size_t temp_storage_bytes = 0; + //! cub::DevicePartition::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DevicePartition::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + //! // d_num_selected_out <-- [4] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam FlagIterator + //! **[inferred]** Random-access input iterator type for reading selection flags @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[in] d_flags + //! Pointer to the input sequence of selection flags + //! + //! @param[out] d_out + //! Pointer to the output sequence of partitioned data items + //! + //! 
@param[out] d_num_selected_out + //! Pointer to the output total number of items selected (i.e., the + //! offset of the unselected partition) + //! + //! @param[in] num_items + //! Total number of items to select from + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * explicit LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const - * { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * std::size_t temp_storage_bytes = 0; - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] - * // d_num_selected_out <-- [5] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectOp - * **[inferred]** Selection functor type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of partitioned data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected (i.e., the - * offset of the unselected partition) - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] select_op - * Unary selection operator - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! 
Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into + //! a partitioned sequence ``d_out``. The total number of items copied into the first partition is written + //! to ``d_num_selected_out``. + //! + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering, however copies of the unselected + //! items are compacted into the rear of ``d_out`` in reverse order. + //! - The range ``[d_out, d_out + num_items)`` shall not overlap + //! ``[d_in, d_in + num_items)`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! explicit LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const + //! { + //! return (a < compare); + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! LessThan select_op(7); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! std::size_t temp_storage_bytes = 0; + //! cub::DevicePartition::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DevicePartition::If( + //! 
d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam SelectOp + //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of ``d_temp_storage`` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of partitioned data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + //! + //! @param[in] num_items + //! Total number of items to select from + //! + //! @param[in] select_op + //! Unary selection operator + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template - * // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * explicit LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const - * { - * return a < compare; - * } - * }; - * - * // Functor type for selecting values greater than some criteria - * struct GreaterThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * explicit GreaterThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const - * { - * return a > compare; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_large_and_unselected_out; // e.g., [ , , , , , , , ] - * int *d_small_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ , ] - * thrust::reverse_iterator unselected_out(d_large_and_unselected_out + num_items); - * LessThan small_items_selector(7); - * GreaterThan large_items_selector(50); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * std::size_t temp_storage_bytes = 0; - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_large_and_medium_out, d_small_out, unselected_out, - * d_num_selected_out, num_items, - * large_items_selector, small_items_selector); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_large_and_medium_out, d_small_out, unselected_out, - * d_num_selected_out, num_items, - * large_items_selector, small_items_selector); - * - * // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ] - * // d_small_out <-- [ 0, 2, 3, 5, 2, , , ] - * // d_num_selected_out <-- [ 1, 5 ] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading - * input items \iterator - * - * @tparam FirstOutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * items selected by first operator \iterator - * - * @tparam SecondOutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * items selected by second operator \iterator - * - * @tparam UnselectedOutputIteratorT - * **[inferred]** Random-access output iterator type for writing - * unselected items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectFirstPartOp - * **[inferred]** Selection functor type having member - * `bool operator()(const T &a)` - * - * @tparam SelectSecondPartOp - * **[inferred]** Selection functor type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and - * no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_first_part_out - * Pointer to the output sequence of data items selected by - * @p select_first_part_op - * - * @param[out] d_second_part_out - * Pointer to the output sequence of data items selected by - * @p select_second_part_op - * - * @param[out] d_unselected_out - * Pointer to the output sequence of unselected data items - * - * @param[out] d_num_selected_out - * Pointer to the output array with two elements, where total number of - * items selected by @p select_first_part_op is stored as - * `d_num_selected_out[0]` and total number of items selected by - * @p select_second_part_op is stored as `d_num_selected_out[1]`, - * respectively - * - * @param[in] num_items - * Total number of items to select from - * - * @param[in] select_first_part_op - * Unary selection operator to select @p d_first_part_out - * - * @param[in] select_second_part_op - * Unary selection operator to select @p d_second_part_out - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses two functors to split the corresponding items from ``d_in`` into three partitioned sequences + //! ``d_first_part_out``, ``d_second_part_out``, and ``d_unselected_out``. + //! The total number of items copied into the first partition is written + //! to ``d_num_selected_out[0]``, while the total number of items copied into the second partition is written + //! to ``d_num_selected_out[1]``. + //! + //! - Copies of the items selected by ``select_first_part_op`` are compacted + //! into ``d_first_part_out`` and maintain their original relative ordering. + //! 
- Copies of the items selected by ``select_second_part_op`` are compacted + //! into ``d_second_part_out`` and maintain their original relative ordering. + //! - Copies of the unselected items are compacted into the ``d_unselected_out`` in reverse order. + //! - The ranges ``[d_out, d_out + num_items)``, + //! ``[d_first_part_out, d_first_part_out + d_num_selected_out[0])``, + //! ``[d_second_part_out, d_second_part_out + d_num_selected_out[1])``, + //! ``[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])``, + //! shall not overlap in any way. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates how this algorithm can partition an + //! input vector into small, medium, and large items so that the relative + //! order of items remain deterministic. + //! + //! Let's consider any value that doesn't exceed six a small one. On the + //! other hand, any value that exceeds 50 will be considered a large one. + //! Since the value used to define a small part doesn't match one that + //! defines the large part, the intermediate segment is implied. + //! + //! These definitions partition a value space into three categories. We want + //! to preserve the order of items in which they appear in the input vector. + //! Since the algorithm provides stable partitioning, this is possible. + //! + //! Since the number of items in each category is unknown beforehand, we need + //! three output arrays of num_items elements each. To reduce the memory + //! requirements, we can combine the output storage for two categories. + //! + //! Since each value falls precisely in one category, it's safe to add + //! "large" values into the head of the shared output vector and the "middle" + //! values into its tail. To add items into the tail of the output array, we + //! can use ``thrust::reverse_iterator``. + //! + //! .. code-block:: c++ + //! + //! #include + //! 
// or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! explicit LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const + //! { + //! return a < compare; + //! } + //! }; + //! + //! // Functor type for selecting values greater than some criteria + //! struct GreaterThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! explicit GreaterThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const + //! { + //! return a > compare; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_large_and_unselected_out; // e.g., [ , , , , , , , ] + //! int *d_small_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ , ] + //! thrust::reverse_iterator unselected_out(d_large_and_unselected_out + num_items); + //! LessThan small_items_selector(7); + //! GreaterThan large_items_selector(50); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! std::size_t temp_storage_bytes = 0; + //! cub::DevicePartition::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_large_and_medium_out, d_small_out, unselected_out, + //! d_num_selected_out, num_items, + //! large_items_selector, small_items_selector); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DevicePartition::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_large_and_medium_out, d_small_out, unselected_out, + //! d_num_selected_out, num_items, + //! 
large_items_selector, small_items_selector); + //! + //! // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ] + //! // d_small_out <-- [ 0, 2, 3, 5, 2, , , ] + //! // d_num_selected_out <-- [ 1, 5 ] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam FirstOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output + //! items selected by first operator @iterator + //! + //! @tparam SecondOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output + //! items selected by second operator @iterator + //! + //! @tparam UnselectedOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing + //! unselected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items + //! selected @iterator + //! + //! @tparam SelectFirstPartOp + //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` + //! + //! @tparam SelectSecondPartOp + //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_first_part_out + //! Pointer to the output sequence of data items selected by `select_first_part_op` + //! + //! @param[out] d_second_part_out + //! Pointer to the output sequence of data items selected by `select_second_part_op` + //! + //! @param[out] d_unselected_out + //! Pointer to the output sequence of unselected data items + //! + //! 
@param[out] d_num_selected_out + //! Pointer to the output array with two elements, where total number of + //! items selected by `select_first_part_op` is stored as + //! `d_num_selected_out[0]` and total number of items selected by + //! `select_second_part_op` is stored as `d_num_selected_out[1]`, + //! respectively + //! + //! @param[in] num_items + //! Total number of items to select from + //! + //! @param[in] select_first_part_op + //! Unary selection operator to select `d_first_part_out` + //! + //! @param[in] select_second_part_op + //! Unary selection operator to select `d_second_part_out` + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -3402,7 +3401,7 @@ public: //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` - //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` + //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two @@ -3520,7 +3519,7 @@ public: //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` - //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` + //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify //! differentiating key bits. This can reduce overall sorting overhead and @@ -3642,11 +3641,7 @@ public: stream); } - //@} end member group + //! 
@} end member group }; -/** - * @example example_device_radix_sort.cu - */ - CUB_NAMESPACE_END diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index bef701684d..715c9556bc 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -26,11 +26,9 @@ * ******************************************************************************/ -/** - * @file cub::DeviceReduce provides device-wide, parallel operations for - * computing a reduction across a sequence of data items residing within - * device-accessible memory. - */ +//! @file cub::DeviceReduce provides device-wide, parallel operations for +//! computing a reduction across a sequence of data items residing within +//! device-accessible memory. #pragma once @@ -56,8 +54,6 @@ CUB_NAMESPACE_BEGIN -//! @ingroup SingleModule -//! //! @rst //! DeviceReduce provides device-wide, parallel operations for computing //! a reduction across a sequence of data items residing within @@ -68,140 +64,127 @@ CUB_NAMESPACE_BEGIN //! //! Overview //! ==================================== +//! //! A `reduction `_ //! (or *fold*) uses a binary combining operator to compute a single aggregate //! from a sequence of input elements. //! //! Usage Considerations //! ==================================== +//! //! @cdp_class{DeviceReduce} //! //! Performance //! ==================================== -//! @linear_performance{reduction, reduce-by-key, and run-length encode} -//! -//! The following chart illustrates DeviceReduce::Sum -//! performance across different CUDA architectures for \p int32 keys. -//! -//! .. image:: ../img/reduce_int32.png -//! :align: center -//! -//! @par -//! The following chart illustrates DeviceReduce::ReduceByKey (summation) -//! performance across different CUDA architectures for `fp32` values. Segments -//! are identified by `int32` keys, and have lengths uniformly sampled -//! from `[1, 1000]`. //! -//! .. image:: ../img/reduce_by_key_fp32_len_500.png -//! 
:align: center +//! @linear_performance{reduction, reduce-by-key, and run-length encode} //! //! @endrst struct DeviceReduce { - /** - * @brief Computes a device-wide reduction using the specified binary - * `reduction_op` functor and initial value `init`. - * - * @par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates a user-defined min-reduction of a - * device vector of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * __device__ __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * CustomMin min_op; - * int init; // e.g., INT_MAX - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Reduce( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items, min_op, init); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceReduce::Reduce( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items, min_op, init); - * - * // d_out <-- [0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam ReductionOpT - * **[inferred]** Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type - * of `InputIteratorT` - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param d_in[in] - * Pointer to the input sequence of data items - * - * @param d_out[out] - * Pointer to the output aggregate - * - * @param num_items[in] - * Total number of input items (i.e., length of `d_in`) - * - * @param reduction_op[in] - * Binary reduction functor - * - * @param[in] init - * Initial value of the reduction - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``. + //! + //! 
- Does not support binary reduction operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to a another device of a different compute-capability + //! because CUB can employ different tile-sizing for different architectures. + //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates a user-defined min-reduction of a + //! device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! __device__ __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! CustomMin min_op; + //! int init; // e.g., INT_MAX + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items, min_op, init); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run reduction + //! cub::DeviceReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items, min_op, init); + //! + //! // d_out <-- [0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! 
+ //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam T + //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] init + //! Initial value of the reduction + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- *
- * // Determine temporary device storage requirements
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceReduce::Sum(
- * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // Allocate temporary storage
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run sum-reduction
- * cub::DeviceReduce::Sum(
- * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // d_out <-- [38]
- * @endcode
- *
- * @tparam InputIteratorT
- * **[inferred]** Random-access input iterator type for reading input
- * items \iterator
- *
- * @tparam OutputIteratorT
- * **[inferred]** Output iterator type for recording the reduced
- * aggregate \iterator
- *
- * @tparam NumItemsT **[inferred]** Type of num_items
- *
- * @param[in] d_temp_storage
- * Device-accessible allocation of temporary storage. When `nullptr`, the
- * required allocation size is written to `temp_storage_bytes` and no work
- * is done.
- *
- * @param[in,out] temp_storage_bytes
- * Reference to size in bytes of `d_temp_storage` allocation
- *
- * @param[in] d_in
- * Pointer to the input sequence of data items
- *
- * @param[out] d_out
- * Pointer to the output aggregate
- *
- * @param[in] num_items
- * Total number of input items (i.e., length of `d_in`)
- *
- * @param[in] stream
- * **[optional]** CUDA stream to launch kernels within.
- * Default is stream0.
- */
+ //! @rst
+ //! Computes a device-wide sum using the addition (``+``) operator.
+ //!
+ //! - Uses ``0`` as the initial value of the reduction.
+ //! - Does not support ``+`` operators that are non-commutative.
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the sum-reduction of a device vector + //! of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Sum( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sum-reduction + //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // d_out <-- [38] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! 
@rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -398,82 +373,82 @@ struct DeviceReduce stream); } - /** - * @brief Computes a device-wide minimum using the less-than ('<') operator. - * - * @par - * - Uses `std::numeric_limits::max()` as the initial value of the reduction. - * - Does not support `<` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the min-reduction of a device vector of - * `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide minimum using the less-than (``<``) operator. + //! + //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction. + //! - Does not support ``<`` operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to a another device of a different compute-capability + //! 
because CUB can employ different tile-sizing for different architectures. + //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run min-reduction + //! cub::DeviceReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + //! + //! // d_out <-- [0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! 
Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -529,86 +504,84 @@ struct DeviceReduce stream); } - /** - * @brief Finds the first device-wide minimum using the less-than ('<') - * operator, also returning the index of that item. - * - * @par - * - The output value type of `d_out` is cub::KeyValuePair `` - * (assuming the value type of `d_in` is `T`) - * - The minimum is written to `d_out.value` and its offset in the input - * array is written to `d_out.key`. - * - The `{1, std::numeric_limits::max()}` tuple is produced for - * zero-length inputs - * - Does not support `<` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // d_out <-- [{5, 0}] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type `T`) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `cub::KeyValuePair`) \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to \p temp_storage_bytes and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item. + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``. + //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! + //! - Does not support ``<`` operators that are non-commutative. + //! 
- Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the argmin-reduction of a device vector
+ //! of ``int`` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include // or equivalently
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_items; // e.g., 7
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! KeyValuePair *d_out; // e.g., [{-,-}]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run argmin-reduction
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
+ //!
+ //! // d_out <-- [{5, 0}]
+ //!
+ //! @endrst
+ //!
+ //! @tparam InputIteratorT
+ //! **[inferred]** Random-access input iterator type for reading input items
+ //! (of some type `T`) @iterator
+ //!
+ //! @tparam OutputIteratorT
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
+ //! (having value type `cub::KeyValuePair`) @iterator
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
+ //! 
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, @@ -680,83 +653,79 @@ struct DeviceReduce stream); } - /** - * @brief Computes a device-wide maximum using the greater-than ('>') operator. - * - * @par - * - Uses `std::numeric_limits::lowest()` as the initial value of the - * reduction. - * - Does not support `>` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the max-reduction of a device vector of - * `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // d_out <-- [9] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide maximum using the greater-than (``>``) operator. + //! + //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - Does not support ``>`` operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to a another device of a different compute-capability + //! 
because CUB can employ different tile-sizing for different architectures. + //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run max-reduction + //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + //! + //! // d_out <-- [9] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! 
Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -813,87 +782,88 @@ struct DeviceReduce stream); } - /** - * @brief Finds the first device-wide maximum using the greater-than ('>') - * operator, also returning the index of that item - * - * @par - * - The output value type of `d_out` is cub::KeyValuePair `` - * (assuming the value type of `d_in` is `T`) - * - The maximum is written to `d_out.value` and its offset in the input - * array is written to `d_out.key`. - * - The `{1, std::numeric_limits::lowest()}` tuple is produced for - * zero-length inputs - * - Does not support `>` operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // d_out <-- [{6, 9}] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type \p T) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `cub::KeyValuePair`) \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide maximum using the greater-than (``>``) + //! operator, also returning the index of that item + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The maximum is written to ``d_out.value`` and its offset in the input + //! array is written to ``d_out.key``. + //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! + //! - Does not support ``>`` operators that are non-commutative. + //! 
- Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the argmax-reduction of a device vector
+ //! of `int` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh>
+ //! // or equivalently <cub/device/device_reduce.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_items; // e.g., 7
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run argmax-reduction
+ //! cub::DeviceReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
+ //!
+ //! // d_out <-- [{6, 9}]
+ //!
+ //! @endrst
+ //!
+ //! @tparam InputIteratorT
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
+ //!
+ //! @tparam OutputIteratorT
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
+ //!
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, @@ -966,114 +936,111 @@ struct DeviceReduce stream); } - /** - * @brief Fuses transform and reduce operations - * - * @par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates a user-defined min-reduction of a - * device vector of `int` data elements. 
- * @par - * @code - * #include - * // or equivalently - * - * thrust::device_vector in = { 1, 2, 3, 4 }; - * thrust::device_vector out(1); - * - * std::size_t temp_storage_bytes = 0; - * std::uint8_t *d_temp_storage = nullptr; - * - * const int init = 42; - * - * cub::DeviceReduce::TransformReduce( - * d_temp_storage, - * temp_storage_bytes, - * in.begin(), - * out.begin(), - * in.size(), - * cub::Sum{}, - * square_t{}, - * init); - * - * thrust::device_vector temp_storage(temp_storage_bytes); - * d_temp_storage = temp_storage.data().get(); - * - * cub::DeviceReduce::TransformReduce( - * d_temp_storage, - * temp_storage_bytes, - * in.begin(), - * out.begin(), - * in.size(), - * cub::Sum{}, - * square_t{}, - * init); - * - * // out[0] <-- 72 - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam ReductionOpT - * **[inferred]** Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam TransformOpT - * **[inferred]** Unary reduction functor type having member - * `auto operator()(const T &a)` - * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type - * of `InputIteratorT` - * - * @tparam NumItemsT - * **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- *
- * @param[in,out] temp_storage_bytes
- * Reference to size in bytes of `d_temp_storage` allocation
- *
- * @param[in] d_in
- * Pointer to the input sequence of data items
- *
- * @param[out] d_out
- * Pointer to the output aggregate
- *
- * @param[in] num_items
- * Total number of input items (i.e., length of `d_in`)
- *
- * @param[in] reduction_op
- * Binary reduction functor
- *
- * @param[in] transform_op
- * Unary transform functor
- *
- * @param[in] init
- * Initial value of the reduction
- *
- * @param[in] stream
- * **[optional]** CUDA stream to launch kernels within.
- * Default is stream0.
- */
+ //! @rst
+ //! Fuses transform and reduce operations
+ //!
+ //! - Does not support binary reduction operators that are non-commutative.
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates a user-defined min-reduction of a
+ //! device vector of `int` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh>
+ //! // or equivalently <cub/device/device_reduce.cuh>
+ //!
+ //! thrust::device_vector<int> in = { 1, 2, 3, 4 };
+ //! thrust::device_vector<int> out(1);
+ //!
+ //! std::size_t temp_storage_bytes = 0;
+ //! std::uint8_t *d_temp_storage = nullptr;
+ //!
+ //! const int init = 42;
+ //!
+ //! cub::DeviceReduce::TransformReduce(
+ //! d_temp_storage,
+ //! temp_storage_bytes,
+ //! in.begin(),
+ //! out.begin(),
+ //! in.size(),
+ //! cub::Sum{},
+ //! square_t{},
+ //! init);
+ //!
+ //! thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+ //!
d_temp_storage = temp_storage.data().get(); + //! + //! cub::DeviceReduce::TransformReduce( + //! d_temp_storage, + //! temp_storage_bytes, + //! in.begin(), + //! out.begin(), + //! in.size(), + //! cub::Sum{}, + //! square_t{}, + //! init); + //! + //! // out[0] <-- 72 + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam TransformOpT + //! **[inferred]** Unary reduction functor type having member `auto operator()(const T &a)` + //! + //! @tparam T + //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] transform_op + //! Unary transform functor + //! + //! @param[in] init + //! Initial value of the reduction + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template th run - * encountered, the first key of the run and the corresponding value - * aggregate of that run are written to `d_unique_out[i] and - * `d_aggregates_out[i]`, respectively. The total number of runs encountered - * is written to `d_num_runs_out`. - * - * @par - * - The `==` equality operator is used to determine whether keys are - * equivalent - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - Let `out` be any of - * `[d_unique_out, d_unique_out + *d_num_runs_out)` - * `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)` - * `d_num_runs_out`. The ranges represented by `out` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_values_in, d_values_in + num_items)` nor `out` in any way. - * - @devicestorage - * - * @par Performance - * The following chart illustrates reduction-by-key (sum) performance across - * different CUDA architectures for `fp32` and `fp64` values, respectively. - * Segments are identified by `int32` keys, and have lengths uniformly - * sampled from `[1, 1000]`. - * - * @image html reduce_by_key_fp32_len_500.png - * @image html reduce_by_key_fp64_len_500.png - * - * @par - * The following charts are similar, but with segment lengths uniformly - * sampled from [1,10]: - * - * @image html reduce_by_key_fp32_len_5.png - * @image html reduce_by_key_fp64_len_5.png - * - * @par Snippet - * The code snippet below illustrates the segmented reduction of `int` values - * grouped by runs of associated `int` keys. 
- * @par - * @code - * #include - * // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] - * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_num_runs_out; // e.g., [-] - * CustomMin reduction_op; - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ReduceByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_unique_out, d_values_in, - * d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduce-by-key - * cub::DeviceReduce::ReduceByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_unique_out, d_values_in, - * d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_aggregates_out <-- [0, 1, 6, 2, 4] - * // d_num_runs_out <-- [5] - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * keys \iterator - * - * @tparam UniqueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing unique - * output keys \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * values \iterator - * - * @tparam AggregatesOutputIterator - * **[inferred]** Random-access output iterator type for writing output - * value aggregates \iterator - * - * @tparam 
NumRunsOutputIteratorT - * **[inferred]** Output iterator type for recording the number of runs - * encountered \iterator - * - * @tparam ReductionOpT - * **[inferred]*8 Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam NumItemsT **[inferred]** Type of num_items - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Pointer to the input sequence of keys - * - * @param[out] d_unique_out - * Pointer to the output sequence of unique keys (one key per run) - * - * @param[in] d_values_in - * Pointer to the input sequence of corresponding values - * - * @param[out] d_aggregates_out - * Pointer to the output sequence of value aggregates - * (one aggregate per run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs encountered - * (i.e., the length of `d_unique_out`) - * - * @param[in] reduction_op - * Binary reduction functor - * - * @param[in] num_items - * Total number of associated key+value pairs - * (i.e., the length of `d_in_keys` and `d_in_values`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. + //! + //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op`` + //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal + //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and + //! 
the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``,
+ //! respectively. The total number of runs encountered is written to ``d_num_runs_out``.
+ //!
+ //! - The ``==`` equality operator is used to determine whether keys are equivalent
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
+ //! (e.g., addition of floating point types) on the same GPU device.
+ //! However, results for pseudo-associative reduction may be inconsistent
+ //! from one device to another device of a different compute-capability
+ //! because CUB can employ different tile-sizing for different architectures.
+ //! - Let ``out`` be any of
+ //! ``[d_unique_out, d_unique_out + *d_num_runs_out)``
+ //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)``
+ //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap
+ //! ``[d_keys_in, d_keys_in + num_items)``,
+ //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of
+ //! associated ``int`` keys.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh>
+ //! // or equivalently <cub/device/device_reduce.cuh>
+ //!
+ //! // CustomMin functor
+ //! struct CustomMin
+ //! {
+ //! template <typename T>
+ //! CUB_RUNTIME_FUNCTION __forceinline__
+ //! T operator()(const T &a, const T &b) const {
+ //! return (b < a) ? b : a;
+ //! }
+ //! };
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_items; // e.g., 8
+ //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+ //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+ //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
+ //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
+ //! int *d_num_runs_out; // e.g., [-]
+ //!
CustomMin reduction_op; + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceReduce::ReduceByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_unique_out, d_values_in, + //! d_aggregates_out, d_num_runs_out, reduction_op, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run reduce-by-key + //! cub::DeviceReduce::ReduceByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_unique_out, d_values_in, + //! d_aggregates_out, d_num_runs_out, reduction_op, num_items); + //! + //! // d_unique_out <-- [0, 2, 9, 5, 8] + //! // d_aggregates_out <-- [0, 1, 6, 2, 4] + //! // d_num_runs_out <-- [5] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam UniqueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam AggregatesOutputIterator + //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator + //! + //! @tparam NumRunsOutputIteratorT + //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! 
Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Pointer to the input sequence of keys + //! + //! @param[out] d_unique_out + //! Pointer to the output sequence of unique keys (one key per run) + //! + //! @param[in] d_values_in + //! Pointer to the input sequence of corresponding values + //! + //! @param[out] d_aggregates_out + //! Pointer to the output sequence of value aggregates + //! (one aggregate per run) + //! + //! @param[out] d_num_runs_out + //! Pointer to total number of runs encountered + //! (i.e., the length of `d_unique_out`) + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] num_items + //! Total number of associated key+value pairs + //! (i.e., the length of `d_in_keys` and `d_in_values`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template *run-length encoding* - * computes a simple compressed representation of a sequence of input elements - * such that each maximal "run" of consecutive same-valued data items is - * encoded as a single data value along with a count of the elements in that - * run. - * - * @par Usage Considerations - * @cdp_class{DeviceRunLengthEncode} - * - * @par Performance - * @linear_performance{run-length encode} - * - * @par - * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode - * performance across different CUDA architectures for `int32` items. - * Segments have lengths uniformly sampled from `[1, 1000]`. - * - * @image html rle_int32_len_500.png - * - * @par - * @plots_below - */ +//! @rst +//! DeviceRunLengthEncode provides device-wide, parallel operations for +//! demarcating "runs" of same-valued items within a sequence residing +//! within device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! A `run-length encoding `_ +//! 
computes a simple compressed representation of a sequence of input elements +//! such that each maximal "run" of consecutive same-valued data items is +//! encoded as a single data value along with a count of the elements in that +//! run. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceRunLengthEncode} +//! +//! Performance +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @linear_performance{run-length encode} +//! +//! @endrst struct DeviceRunLengthEncode { - /** - * @brief Computes a run-length encoding of the sequence \p d_in. - * - * @par - * - For the *i*th run encountered, the first key of the run and - * its length are written to `d_unique_out[i]` and `d_counts_out[i]`, - * respectively. - * - The total number of runs encountered is written to `d_num_runs_out`. - * - The `==` equality operator is used to determine whether values are - * equivalent - * - In-place operations are not supported. There must be no overlap between - * any of the provided ranges: - * - `[d_unique_out, d_unique_out + *d_num_runs_out)` - * - `[d_counts_out, d_counts_out + *d_num_runs_out)` - * - `[d_num_runs_out, d_num_runs_out + 1)` - * - `[d_in, d_in + num_items)` - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated encode performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * Segments have lengths uniformly sampled from [1,1000]. - * - * @image html rle_int32_len_500.png - * @image html rle_int64_len_500.png - * - * @par - * The following charts are similar, but with segment lengths uniformly - * sampled from [1,10]: - * - * @image html rle_int32_len_5.png - * @image html rle_int64_len_5.png - * - * @par Snippet - * The code snippet below illustrates the run-length encoding of a sequence - * of `int` values. 
- * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_unique_out; // e.g., [ , , , , , , , ] - * int *d_counts_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::Encode( - * d_temp_storage, temp_storage_bytes, - * d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::Encode( - * d_temp_storage, temp_storage_bytes, - * d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_counts_out <-- [1, 2, 1, 3, 1] - * // d_num_runs_out <-- [5] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam UniqueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing unique - * output items \iterator - * - * @tparam LengthsOutputIteratorT - * **[inferred]** Random-access output iterator type for writing output - * counts \iterator - * - * @tparam NumRunsOutputIteratorT - * **[inferred]** Output iterator type for recording the number of runs - * encountered \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of keys - * - * @param[out] d_unique_out - * Pointer to the output sequence of unique keys (one key per run) - * - * @param[out] d_counts_out - * Pointer to the output sequence of run-lengths (one count per run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs - * - * @param[in] num_items - * Total number of associated key+value pairs (i.e., the length of - * `d_in_keys` and `d_in_values`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a run-length encoding of the sequence ``d_in``. + //! + //! - For the *i*\ :sup:`th` run encountered, the first key of the run and + //! its length are written to ``d_unique_out[i]`` and ``d_counts_out[i]``, respectively. + //! - The total number of runs encountered is written to ``d_num_runs_out``. + //! - The ``==`` equality operator is used to determine whether values are equivalent + //! - In-place operations are not supported. There must be no overlap between any of the provided ranges: + //! + //! - ``[d_unique_out, d_unique_out + *d_num_runs_out)`` + //! - ``[d_counts_out, d_counts_out + *d_num_runs_out)`` + //! - ``[d_num_runs_out, d_num_runs_out + 1)`` + //! - ``[d_in, d_in + num_items)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the run-length encoding of a sequence of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_unique_out; // e.g., [ , , , , , , , ] + //! int *d_counts_out; // e.g., [ , , , , , , , ] + //! 
int *d_num_runs_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceRunLengthEncode::Encode( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run encoding + //! cub::DeviceRunLengthEncode::Encode( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + //! + //! // d_unique_out <-- [0, 2, 9, 5, 8] + //! // d_counts_out <-- [1, 2, 1, 3, 1] + //! // d_num_runs_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam UniqueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing unique output items @iterator + //! + //! @tparam LengthsOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing output counts @iterator + //! + //! @tparam NumRunsOutputIteratorT + //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of keys + //! + //! @param[out] d_unique_out + //! Pointer to the output sequence of unique keys (one key per run) + //! + //! @param[out] d_counts_out + //! Pointer to the output sequence of run-lengths (one count per run) + //! + //! @param[out] d_num_runs_out + //! Pointer to total number of runs + //! + //! 
@param[in] num_items + //! Total number of associated key+value pairs (i.e., the length of `d_in_keys` and `d_in_values`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template 1`) of same-valued keys in the sequence `d_in`. - * - * @par - * - For the *i*th non-trivial run, the run's starting offset and - * its length are written to `d_offsets_out[i]` and `d_lengths_out[i]`, - * respectively. - * - The total number of runs encountered is written to `d_num_runs_out`. - * - The `==` equality operator is used to determine whether values are - * equivalent - * - In-place operations are not supported. There must be no overlap between - * any of the provided ranges: - * - `[d_offsets_out, d_offsets_out + *d_num_runs_out)` - * - `[d_lengths_out, d_lengths_out + *d_num_runs_out)` - * - `[d_num_runs_out, d_num_runs_out + 1)` - * - `[d_in, d_in + num_items)` - * - @devicestorage - * - * @par Performance - * - * @par Snippet - * The code snippet below illustrates the identification of non-trivial runs - * within a sequence of `int` values. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_offsets_out; // e.g., [ , , , , , , , ] - * int *d_lengths_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::NonTrivialRuns( - * d_temp_storage, temp_storage_bytes, - * d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::NonTrivialRuns( - * d_temp_storage, temp_storage_bytes, - * d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // d_offsets_out <-- [1, 4] - * // d_lengths_out <-- [2, 3] - * // d_num_runs_out <-- [2] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OffsetsOutputIteratorT - * **[inferred]** Random-access output iterator type for writing run-offset - * values \iterator - * - * @tparam LengthsOutputIteratorT - * **[inferred]** Random-access output iterator type for writing run-length - * values \iterator - * - * @tparam NumRunsOutputIteratorT - * **[inferred]** Output iterator type for recording the number of runs - * encountered \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to input sequence of data items - * - * @param[out] d_offsets_out - * Pointer to output sequence of run-offsets - * (one offset per non-trivial run) - * - * @param[out] d_lengths_out - * Pointer to output sequence of run-lengths - * (one count per non-trivial run) - * - * @param[out] d_num_runs_out - * Pointer to total number of runs (i.e., length of `d_offsets_out`) - * - * @param[in] num_items - * Total number of associated key+value pairs (i.e., the length of - * `d_in_keys` and `d_in_values`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Enumerates the starting offsets and lengths of all non-trivial runs + //! (of ``length > 1``) of same-valued keys in the sequence ``d_in``. + //! + //! - For the *i*\ :sup:`th` non-trivial run, the run's starting offset and + //! its length are written to ``d_offsets_out[i]`` and ``d_lengths_out[i]``, respectively. + //! - The total number of runs encountered is written to ``d_num_runs_out``. + //! - The ``==`` equality operator is used to determine whether values are equivalent + //! - In-place operations are not supported. There must be no overlap between any of the provided ranges: + //! + //! - ``[d_offsets_out, d_offsets_out + *d_num_runs_out)`` + //! - ``[d_lengths_out, d_lengths_out + *d_num_runs_out)`` + //! - ``[d_num_runs_out, d_num_runs_out + 1)`` + //! - ``[d_in, d_in + num_items)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the identification of non-trivial runs + //! within a sequence of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! 
int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_offsets_out; // e.g., [ , , , , , , , ] + //! int *d_lengths_out; // e.g., [ , , , , , , , ] + //! int *d_num_runs_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceRunLengthEncode::NonTrivialRuns( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run encoding + //! cub::DeviceRunLengthEncode::NonTrivialRuns( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + //! + //! // d_offsets_out <-- [1, 4] + //! // d_lengths_out <-- [2, 3] + //! // d_num_runs_out <-- [2] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OffsetsOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing run-offset values @iterator + //! + //! @tparam LengthsOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing run-length values @iterator + //! + //! @tparam NumRunsOutputIteratorT + //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to input sequence of data items + //! + //! @param[out] d_offsets_out + //! Pointer to output sequence of run-offsets + //! 
(one offset per non-trivial run) + //! + //! @param[out] d_lengths_out + //! Pointer to output sequence of run-lengths (one count per non-trivial run) + //! + //! @param[out] d_num_runs_out + //! Pointer to total number of runs (i.e., length of `d_offsets_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template th output reduction incorporates the *i*th input. - * The term *exclusive* indicates the *i*th input is not - * incorporated into the *i*th output reduction. When the input and - * output sequences are the same, the scan is performed in-place. - * - * @par - * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our - * *"decoupled look-back"* algorithm for performing global prefix scan with - * only a single pass through the input data, as described in our 2016 technical - * report [1]. The central idea is to leverage a small, constant factor of - * redundant work in order to overlap the latencies of global prefix - * propagation with local computation. As such, our algorithm requires only - * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and - * typically proceeds at "memcpy" speeds. Our algorithm supports inplace - * operations. - * - * @par - * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) - * - * @par Usage Considerations - * @cdp_class{DeviceScan} - * - * @par Performance - * @linear_performance{prefix scan} - * - * @par - * The following chart illustrates DeviceScan::ExclusiveSum performance across - * different CUDA architectures for `int32` keys. 
- * @plots_below - * - * @image html scan_int32.png - * - */ +//! @rst +//! DeviceScan provides device-wide, parallel operations for computing a +//! prefix scan across a sequence of data items residing within +//! device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! Given a sequence of input elements and a binary reduction operator, a +//! `prefix scan `_ produces an output +//! sequence where each element is computed to be the reduction of the elements +//! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan +//! with the addition operator. The term *inclusive* indicates that the +//! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input. +//! The term *exclusive* indicates the *i*\ :sup:`th` input is not +//! incorporated into the *i*\ :sup:`th` output reduction. When the input and +//! output sequences are the same, the scan is performed in-place. +//! +//! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our +//! *"decoupled look-back"* algorithm for performing global prefix scan with +//! only a single pass through the input data, as described in our 2016 technical +//! report [1]_. The central idea is to leverage a small, constant factor of +//! redundant work in order to overlap the latencies of global prefix +//! propagation with local computation. As such, our algorithm requires only +//! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and +//! typically proceeds at "memcpy" speeds. Our algorithm supports in-place operations. +//! +//! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back +//! `_, +//! *NVIDIA Technical Report NVR-2016-002*, 2016. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceScan} +//! +//! Performance +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! 
@linear_performance{prefix scan} +//! +//! @endrst struct DeviceScan { - /******************************************************************//** - * \name Exclusive scans - *********************************************************************/ - //@{ - - /** - * @brief Computes a device-wide exclusive prefix sum. The value of `0` is - * applied as the initial value, and is assigned to `*d_out`. - * - * @par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated exclusive sum performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * - * @image html scan_int32.png - * @image html scan_int64.png - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix sum of an `int` - * device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // d_out <-- [0, 8, 14, 21, 26, 29, 29] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @name Exclusive scans + //! @{ + + + //! @rst + //! Computes a device-wide exclusive prefix sum. + //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! 
operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. + //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix sum of an ``int`` + //! device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix sum + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // d_out <-- [0, 8, 14, 21, 26, 29, 29] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! 
Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, @@ -235,81 +216,72 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide exclusive prefix sum in-place. The value of - * `0` is applied as the initial value, and is assigned to `*d_data`. - * - * @par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated exclusive sum performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * - * @image html scan_int32.png - * @image html scan_int64.png - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix sum of an `int` - * device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // d_data <-- [0, 8, 14, 21, 26, 29, 29] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access iterator type for reading scan - * inputs and wrigin scan outputs - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix sum in-place. + //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the exclusive prefix sum of an ``int`` + //! device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix sum + //! cub::DeviceScan::ExclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // d_data <-- [0, 8, 14, 21, 26, 29, 29] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Random-access iterator to the sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(void *d_temp_storage, @@ -345,113 +317,108 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. 
The `init_value` value is applied as - * the initial value, and is assigned to `*d_out`. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an - * `int` device vector - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... 
- * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to *d_out) - * - * @param[in] num_items - * Total number of input items (i.e., the length of \p d_in) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. 
Default is - * stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified + //! binary ``scan_op`` functor. The ``init_value`` value is applied as + //! the initial value, and is assigned to ``*d_out``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The + //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, (int) INT_MAX, num_items); + //! + //! 
// Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, (int) INT_MAX, num_items); + //! + //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used Binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * CustomMin min_op; - * ... - * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, (int) INT_MAX, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, (int) INT_MAX, num_items); - * - * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to *d_out) - * - * @param[in] num_items - * Total number of input items (i.e., the length of \p d_in) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. Default is - * stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified + //! binary ``scan_op`` functor. The ``init_value`` value is applied as + //! the initial value, and is assigned to ``*d_data``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an + //! ``int`` device vector: + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! CustomMin min_op; + //! ... + //! + //! 
// Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, (int) INT_MAX, num_items); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, (int) INT_MAX, num_items); + //! + //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used Binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Random-access iterator to the sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template @@ -656,118 +620,112 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. The `init_value` value is provided as - * a future value. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an - * `int` device vector - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * int *d_init_iter; // e.g., INT_MAX - * CustomMin min_op; - * - * auto future_init_value = - * cub::FutureValue(d_init_iter); - * - * ... 
- * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, future_init_value, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, future_init_value, num_items); - * - * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to `*d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. 
- * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified + //! binary ``scan_op`` functor. The ``init_value`` value is provided as a future value. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. + //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! int *d_init_iter; // e.g., INT_MAX + //! CustomMin min_op; + //! + //! auto future_init_value = + //! cub::FutureValue(d_init_iter); + //! + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! 
d_in, d_out, min_op, future_init_value, num_items); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, future_init_value, num_items); + //! + //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used Binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_init_iter; // e.g., INT_MAX - * CustomMin min_op; - * - * auto future_init_value = - * cub::FutureValue(d_init_iter); - * - * ... - * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, future_init_value, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, future_init_value, num_items); - * - * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type - * having member `T operator()(const T &a, const T &b)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in,out] d_data - * Pointer to the sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to `*d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. + //! The ``init_value`` value is provided as a future value. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_init_iter; // e.g., INT_MAX + //! CustomMin min_op; + //! + //! auto future_init_value = + //! 
cub::FutureValue(d_init_iter); + //! + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, future_init_value, num_items); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, future_init_value, num_items); + //! + //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` used by the binary scan functor type + //! having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Pointer to the sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to `*d_data`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive - * // prefix sum - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, num_items); - * - * // d_out <-- [8, 14, 21, 26, 29, 29, 38] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @} end member group + //! @name Inclusive scans + //! @{ + + //! @rst + //! 
Computes a device-wide inclusive prefix sum. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The + //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix sum + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // Allocate temporary storage for inclusive prefix sum + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix sum + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, num_items); + //! + //! // d_out <-- [8, 14, 21, 26, 29, 29, 38] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @param[in] d_temp_storage + //! 
Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, @@ -1119,75 +1064,71 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide inclusive prefix sum in-place. - * - * @par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the inclusive prefix sum of an `int` - * device vector. - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * ... 
- * - * // Determine temporary device storage requirements for inclusive - * // prefix sum - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, - * d_data, num_items); - * - * // d_data <-- [8, 14, 21, 26, 29, 29, 38] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix sum in-place. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix sum + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // Allocate temporary storage for inclusive prefix sum + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix sum + //! cub::DeviceScan::InclusiveSum( + //! d_temp_storage, temp_storage_bytes, + //! d_data, num_items); + //! + //! // d_data <-- [8, 14, 21, 26, 29, 29, 38] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Random-access iterator to the sequence of data items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(void *d_temp_storage, @@ -1223,106 +1164,100 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide inclusive prefix scan using the specified - * binary `scan_op` functor. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` - * shall not overlap in any other way. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an - * `int` device vector. - * - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive - * // prefix scan - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, num_items); - * - * // d_out <-- [8, 6, 6, 5, 3, 0, 0] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @param[in] - * d_temp_storage Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Random-access iterator to the input sequence of data items - * - * @param[out] d_out - * Random-access iterator to the output sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. + //! + //! - Supports non-commutative scan operators. 
+ //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The + //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` + //! shall not overlap in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix scan + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, num_items); + //! + //! // Allocate temporary storage for inclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix min-scan + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, min_op, num_items); + //! + //! // d_out <-- [8, 6, 6, 5, 3, 0, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! 
**[inferred]** Random-access input iterator type for reading scan inputs @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @param[in] + //! d_temp_storage Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to + //! `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Random-access iterator to the input sequence of data items + //! + //! @param[out] d_out + //! Random-access iterator to the output sequence of data items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(void *d_temp_storage, @@ -1374,95 +1309,90 @@ struct DeviceScan stream); } - /** - * @brief Computes a device-wide inclusive prefix scan using the specified - * binary `scan_op` functor. - * - * @par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an - * `int` device vector. 
- * - * @par - * @code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] - * CustomMin min_op; - * ... - * - * // Determine temporary device storage requirements for inclusive - * // prefix scan - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_data, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, min_op, num_items); - * - * // d_data <-- [8, 6, 6, 5, 3, 0, 0] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan - * inputs and writing scan outputs - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @param[in] - * d_temp_storage Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_data - * Random-access iterator to the sequence of data items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. 
- * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! CustomMin min_op; + //! ... + //! + //! // Determine temporary device storage requirements for inclusive + //! // prefix scan + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, num_items); + //! + //! // Allocate temporary storage for inclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix min-scan + //! cub::DeviceScan::InclusiveScan( + //! d_temp_storage, temp_storage_bytes, + //! d_data, min_op, num_items); + //! + //! // d_data <-- [8, 6, 6, 5, 3, 0, 0] + //! + //! @endrst + //! + //! @tparam IteratorT + //! 
**[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
+ //!
+ //! @tparam ScanOp
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage.
+ //! When `nullptr`, the required allocation size is written to
+ //! `temp_storage_bytes` and no work is done.
+ //!
+ //! @param[in,out] temp_storage_bytes
+ //! Reference to size in bytes of `d_temp_storage` allocation
+ //!
+ //! @param[in] d_data
+ //! Random-access iterator to the sequence of data items
+ //!
+ //! @param[in] scan_op
+ //! Binary scan functor
+ //!
+ //! @param[in] num_items
+ //! Total number of input items (i.e., the length of `d_data`)
+ //!
+ //! @param[in] stream
+ //! @rst
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+ //! @endrst
 template <typename IteratorT, typename ScanOp>
 CUB_RUNTIME_FUNCTION static cudaError_t
 InclusiveScan(void *d_temp_storage,
- * - `d_values_in` may equal `d_values_out` but the range - * `[d_values_in, d_values_in + num_items)` and the range - * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the exclusive prefix sum-by-key of an - * `int` device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = nullptr; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan keys - * inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * values inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * values outputs \iterator - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix sum-by-key with key equality + //! defined by ``equality_op``. The value of ``0`` is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! 
Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = nullptr; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix sum + //! cub::DeviceScan::ExclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! 
@param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for exclusive - * // prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, - * (int) INT_MAX, num_items, equality_op); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, - * (int) INT_MAX, num_items, equality_op); - * - * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan keys - * inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan values - * inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan values - * outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` value used in Binary scan - * functor type having member `T operator()(const T &a, const T &b)` - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] init_value - * Initial value to seed the exclusive scan (and is assigned to the - * beginning of each segment in `d_values_out`) - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide exclusive prefix scan-by-key using the + //! specified binary ``scan_op`` functor. The key equality is defined by + //! ``equality_op``. The ``init_value`` value is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! 
``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // CustomEqual functor + //! struct CustomEqual + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return a == b; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! CustomEqual equality_op; + //! ... + //! + //! // Determine temporary device storage requirements for exclusive + //! // prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::ExclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, + //! (int) INT_MAX, num_items, equality_op); + //! + //! // Allocate temporary storage for exclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run exclusive prefix min-scan + //! cub::DeviceScan::ExclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, + //! (int) INT_MAX, num_items, equality_op); + //! + //! 
// d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` value used in Binary scan + //! functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to the + //! beginning of each segment in `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and + //! `d_values_in`) + //! + //! @param[in] equality_op + //! 
Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_out <-- [8, 14, 7, 12, 15, 0, 9] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * keys inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * values inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * values outputs \iterator - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! 
+ //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements for inclusive prefix sum + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // Allocate temporary storage for inclusive prefix sum + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix sum + //! cub::DeviceScan::InclusiveSumByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, num_items); + //! + //! // d_out <-- [8, 14, 7, 12, 15, 0, 9] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! 
@param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for - * // input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScanByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // d_out <-- [8, 6, 7, 5, 3, 0, 0] - * - * @endcode - * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan keys - * inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan - * values inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan - * values outputs \iterator - * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that - * defines the equality of keys - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to - * `temp_storage_bytes` and no work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in - * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out - * Random-access output iterator to the output sequence of value items - * - * @param[in] scan_op - * Binary scan functor - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and - * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. - * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ + //! @rst + //! Computes a device-wide inclusive prefix scan-by-key using the + //! specified binary ``scan_op`` functor. The key equality is defined by ``equality_op``. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! #include // for INT_MAX + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // CustomEqual functor + //! struct CustomEqual + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return a == b; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // input and output + //! int num_items; // e.g., 7 + //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_values_out; // e.g., [ , , , , , , ] + //! CustomMin min_op; + //! CustomEqual equality_op; + //! ... + //! + //! // Determine temporary device storage requirements for inclusive prefix scan + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceScan::InclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); + //! + //! // Allocate temporary storage for inclusive prefix scan + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run inclusive prefix min-scan + //! cub::DeviceScan::InclusiveScanByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); + //! + //! // d_out <-- [8, 6, 7, 5, 3, 0, 0] + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! 
+ //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOp + //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary scan functor + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cub::Equality(). + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template `_ +//! arranges items into ascending (or descending) order. The algorithm relies +//! upon a positional representation for keys, i.e., each key is comprised of an +//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from +//! least-significant to most-significant. For a given input sequence of keys +//! and a set of rules specifying a total ordering of the symbolic alphabet, the +//! 
radix sorting method produces a lexicographic ordering of those keys. +//! +//! See Also +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See +//! that algorithm's documentation for more information. +//! +//! Segments are not required to be contiguous. Any element of input(s) or +//! output(s) outside the specified segments will not be accessed nor modified. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceSegmentedRadixSort} +//! +//! @endrst struct DeviceSegmentedRadixSort { - /******************************************************************//** - * @name Key-value pairs - *********************************************************************/ - //@{ + //! @name Key-value pairs + //! @{ - /** - * @brief Sorts segments of key-value pairs into ascending order. - * (`~2N` auxiliary storage required) - * - * @par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - @devicestorageNP For sorting using only `O(P)` temporary storage, see - * the sorting interface using DoubleBuffer wrappers below. 
- * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys with associated vector of - * `int` values. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs( - d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading 
segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. If - * `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. 
- * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. (``~2N`` auxiliary storage required) + //! + //! - The contents of the input data are not altered by the sorting operation + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! 
(with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! 
+ //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! Random-access input iterator to the sequence of beginning offsets of + //! length `num_segments`, such that `d_begin_offsets[i]` is the first + //! element of the *i*th data segment in `d_keys_*` and `d_values_*` + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. If + //! ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. 
Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer - * contains the unsorted input values and, upon return, is updated to point - * to the sorted output values - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) + //! + //! 
- The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! - The contents of both buffers within each pair may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and yield + //! a corresponding performance improvement. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! - @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedRadixSort::SortPairs(
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run sorting operation
+ //! cub::DeviceSegmentedRadixSort::SortPairs(
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+ //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
+ //!
+ //! @endrst
+ //!
+ //! @tparam KeyT
+ //! **[inferred]** Key type
+ //!
+ //! @tparam ValueT
+ //! **[inferred]** Value type
+ //!
+ //! @tparam BeginOffsetIteratorT
+ //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
+ //!
+ //! @tparam EndOffsetIteratorT
+ //! 
**[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point + //! to the sorted output values + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! 
**[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * 
@param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th - * is considered empty. 
- * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). + //! + //! - The contents of the input data are not altered by the sorting operation + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! 
(with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! 
comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam ValueT - * **[inferred]** Value type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of 
temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer - * contains the unsorted input values and, upon return, is updated to point - * to the sorted output values - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th - * is considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! 
Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! - The contents of both buffers within each pair may be altered by the + //! sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! 
- @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point + //! to the sorted output values + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! 
If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. (``~2N`` auxiliary storage required) + //! + //! - The contents of the input data are not altered by the sorting operation + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! 
bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of `int` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! 
cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! 
``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1165,132 +1155,136 @@ struct DeviceSegmentedRadixSort stream); } - /** - * @brief Sorts segments of keys into ascending order. (~N auxiliary storage required). - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the - * number of key bits specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter - * is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. 
- * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - @devicestorageP - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** 
Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1` <= d_begin_offsets[i]`, the *i*th - * is considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) - * needed for key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. 
(``~N`` auxiliary storage required). + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the + //! number of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys.Alternate()[i]`` will not be accessed nor modified. + //! - @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! 
// or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! 
buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) + //! needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1363,127 +1357,126 @@ struct DeviceSegmentedRadixSort stream); } - /** - * @brief Sorts segments of keys into descending order. - * (`~2N` auxiliary storage required). 
- * - * @par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter - * is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - @devicestorageNP For sorting using only `O(P)` temporary storage, see - * the sorting interface using DoubleBuffer wrappers below. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. 
- * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., sizeof(unsigned int) * 8) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). + //! + //! - The contents of the input data are not altered by the sorting operation + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. 
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see + //! the sorting interface using DoubleBuffer wrappers below. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. `num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! 
**[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., sizeof(unsigned int) * 8) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1559,132 +1552,134 @@ struct DeviceSegmentedRadixSort stream); } - /** - * @brief Sorts segments of keys into descending order. - * (`~N` auxiliary storage required). - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the - * number of key bits specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and - * yield a corresponding performance improvement. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. 
For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - @devicestorageP - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * **[inferred]** Key type - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items within the segmented array, including items not - * covered by segments. `num_items` should match the largest element within - * the range `[d_end_offsets, d_end_offsets + num_segments)`. - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i], the *i*th is - * considered empty. - * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for - * key comparison - * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key - * comparison (e.g., `sizeof(unsigned int) * 8`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! 
- The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the + //! number of key bits specified and the targeted device architecture). + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and + //! yield a corresponding performance improvement. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. + //! - @devicestorageP + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of `int` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! 
int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedRadixSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items within the segmented array, including items not + //! covered by segments. 
`num_items` should match the largest element within + //! the range `[d_end_offsets, d_end_offsets + num_segments)`. + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for key comparison + //! + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key + //! comparison (e.g., `sizeof(unsigned int) * 8`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1757,9 +1752,7 @@ struct DeviceSegmentedRadixSort stream); } - //@} end member group + //! 
@} end member group }; CUB_NAMESPACE_END - - diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index a227a2f1e9..861d6c6b6b 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -26,11 +26,9 @@ * ******************************************************************************/ -/** - * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations - * for computing a batched reduction across multiple sequences of data - * items residing within device-accessible memory. - */ +//! @file cub::DeviceSegmentedReduce provides device-wide, parallel operations +//! for computing a batched reduction across multiple sequences of data +//! items residing within device-accessible memory. #pragma once @@ -55,155 +53,155 @@ CUB_NAMESPACE_BEGIN -/** - * @brief DeviceSegmentedReduce provides device-wide, parallel operations for - * computing a reduction across multiple sequences of data items - * residing within device-accessible memory. ![](reduce_logo.png) - * @ingroup SegmentedModule - * - * @par Overview - * A *reduction* - * (or *fold*) uses a binary combining operator to compute a single aggregate - * from a sequence of input elements. - * - * @par Usage Considerations - * @cdp_class{DeviceSegmentedReduce} - * - */ +//! @rst +//! DeviceSegmentedReduce provides device-wide, parallel operations for +//! computing a reduction across multiple sequences of data items +//! residing within device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! A `reduction `_ +//! (or *fold*) uses a binary combining operator to compute a single aggregate +//! from a sequence of input elements. +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceSegmentedReduce} +//! +//! 
@endrst struct DeviceSegmentedReduce { - /** - * @brief Computes a device-wide segmented reduction using the specified - * binary `reduction_op` functor. - * - * @par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates a custom min-reduction of a device - * vector of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * CustomMin min_op; - * int initial_value; // e.g., INT_MAX - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Reduce( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceSegmentedReduce::Reduce( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // d_out <-- [6, INT_MAX, 0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @tparam ReductionOpT - * **[inferred]** Binary reduction functor type having member - * `T operator()(const T &a, const T &b)` - * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type - * of `InputIteratorT` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no - * work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of \p d_temp_storage allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] reduction_op - * Binary reduction functor - * - * @param[in] initial_value - * Initial value of the reduction for each segment - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented reduction using the specified + //! binary ``reduction_op`` functor. + //! + //! - Does not support binary reduction operators that are non-commutative. + //! - Provides "run-to-run" determinism for pseudo-associative reduction + //! (e.g., addition of floating point types) on the same GPU device. + //! However, results for pseudo-associative reduction may be inconsistent + //! from one device to another device of a different compute-capability + //! because CUB can employ different tile-sizing for different architectures. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! 
the latter is specified as ``segment_offsets + 1``). + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates a custom min-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // CustomMin functor + //! struct CustomMin + //! { + //! template + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! T operator()(const T &a, const T &b) const { + //! return (b < a) ? b : a; + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! CustomMin min_op; + //! int initial_value; // e.g., INT_MAX + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run reduction + //! cub::DeviceSegmentedReduce::Reduce( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + //! + //! // d_out <-- [6, INT_MAX, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! 
**[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @tparam ReductionOpT + //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam T + //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! 
@endrst + //! + //! @param[in] reduction_op + //! Binary reduction functor + //! + //! @param[in] initial_value + //! Initial value of the reduction for each segment + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Sum( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceSegmentedReduce::Sum( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [21, 0, 17] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional] CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented sum using the addition (``+``) operator. + //! + //! - Uses ``0`` as the initial value of the reduction for each segment. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - Does not support ``+`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! 
The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Sum( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sum-reduction + //! cub::DeviceSegmentedReduce::Sum( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [21, 0, 17] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! 
@param[out] d_out
+ //! Pointer to the output aggregate
+ //!
+ //! @param[in] num_segments
+ //! The number of segments that comprise the sorting data
+ //!
+ //! @param[in] d_begin_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of beginning offsets of
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
+ //! ``d_values_*``
+ //! @endrst
+ //!
+ //! @param[in] d_end_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of ending offsets of length
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
+ //! @endrst
+ //!
+ //! @param[in] stream
+ //! @rst
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+ //! @endrst
template ::max()` as the initial value of the
- * reduction for each segment.
- * - When input a contiguous sequence of segments, a single sequence
- * `segment_offsets` (of length `num_segments + 1`) can be aliased for both
- * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
- * specified as `segment_offsets + 1`).
- * - Does not support `<` operators that are non-commutative.
- * - Let `s` be in `[0, num_segments)`. The range
- * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
- * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
- * `[d_begin_offsets, d_begin_offsets + num_segments)` nor
- * `[d_end_offsets, d_end_offsets + num_segments)`.
- * - @devicestorage
- *
- * @par Snippet
- * The code snippet below illustrates the min-reduction of a device vector of
- * `int` data elements.
- * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceSegmentedReduce::Min( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [6, INT_MAX, 0] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented minimum using the less-than (``<``) operator. + //! + //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction for each segment. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is + //! specified as ``segment_offsets + 1``). + //! - Does not support ``<`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! 
+++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run min-reduction + //! cub::DeviceSegmentedReduce::Min( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [6, INT_MAX, 0] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! 
Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template ` - * (assuming the value type of `d_in` is `T`) - * - The minimum of the *i*th segment is written to - * `d_out[i].value` and its offset in that segment is written to - * `d_out[i].key`. - * - The `{1, std::numeric_limits::max()}` tuple is produced for - * zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter - * is specified as `segment_offsets + 1`). - * - Does not support `<` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. 
- * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceSegmentedReduce::ArgMin( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type `T`) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `KeyValuePair`) \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the - * *i*th is considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide minimum in each segment using the + //! less-than (``<``) operator, also returning the in-segment index of that item. + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The minimum of the *i*\ :sup:`th` segment is written to + //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. + //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs + //! + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both + //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter + //! is specified as ``segment_offsets + 1``). + //! - Does not support ``<`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! 
``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::ArgMin( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run argmin-reduction + //! cub::DeviceSegmentedReduce::ArgMin( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate + //! (having value type `KeyValuePair`) @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template `) operator. - * - * @par - * - Uses `std::numeric_limits::lowest()` as the initial value of the - * reduction. - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - Does not support `>` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. 
The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the max-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceSegmentedReduce::Max( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [8, INT_MIN, 9] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced - * aggregate \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. + //! + //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - Does not support ``>`` operators that are non-commutative. + //! - Let ``s`` be in ``[0, num_segments)``. The range + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! 
``[d_end_offsets, d_end_offsets + num_segments)``. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_out; // e.g., [-, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedReduce::Max( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run max-reduction + //! cub::DeviceSegmentedReduce::Max( + //! d_temp_storage, temp_storage_bytes, d_in, d_out, + //! num_segments, d_offsets, d_offsets + 1); + //! + //! // d_out <-- [8, INT_MIN, 9] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! 
@param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output aggregate + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template ') operator, also returning the in-segment index of - * that item - * - * @par - * - The output value type of `d_out` is `cub::KeyValuePair` - * (assuming the value type of `d_in` is `T`) - * - The maximum of the *i*th segment is written to - * `d_out[i].value` and its offset in that segment is written to - * `d_out[i].key`. - * - The `{1, std::numeric_limits::lowest()}` tuple is produced for - * zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased - * for both the `d_begin_offsets` and `d_end_offsets` parameters (where - * the latter is specified as `segment_offsets + 1`). - * - Does not support `>` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. 
The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not - * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)`. - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector - * of `int` data elements. - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceSegmentedReduce::ArgMax( - * d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items - * (of some type `T`) \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate - * (having value type `KeyValuePair`) \iterator - * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary 
storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output aggregate - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is - * considered empty. - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Finds the first device-wide maximum in each segment using the + //! greater-than (``>``) operator, also returning the in-segment index of that item + //! + //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` + //! (assuming the value type of ``d_in`` is ``T``) + //! + //! - The maximum of the *i*\ :sup:`th` segment is written to + //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. + //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs + //! + //! - When input a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! 
- Does not support ``>`` operators that are non-commutative.
+ //! - Let ``s`` be in ``[0, num_segments)``. The range
+ //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
+ //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)``.
+ //! - @devicestorage
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the argmax-reduction of a device vector
+ //! of ``int`` data elements.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include 
+ //! // or equivalently 
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for input and output
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out,
+ //! num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // Allocate temporary storage
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ //!
+ //! // Run argmax-reduction
+ //! cub::DeviceSegmentedReduce::ArgMax(
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out,
+ //! num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+ //!
+ //! @endrst
+ //!
+ //! @tparam InputIteratorT
+ //! **[inferred]** Random-access input iterator type for reading input items
+ //! (of some type `T`) @iterator
+ //!
+ //! @tparam OutputIteratorT
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
+ //! (having value type `KeyValuePair`) @iterator
+ //!
+ //! @tparam BeginOffsetIteratorT
+ //!
**[inferred]** Random-access input iterator type for reading segment
+ //! beginning offsets @iterator
+ //!
+ //! @tparam EndOffsetIteratorT
+ //! **[inferred]** Random-access input iterator type for reading segment
+ //! ending offsets @iterator
+ //!
+ //! @param[in] d_temp_storage
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
+ //! required allocation size is written to `temp_storage_bytes` and no work
+ //! is done.
+ //!
+ //! @param[in,out] temp_storage_bytes
+ //! Reference to size in bytes of `d_temp_storage` allocation
+ //!
+ //! @param[in] d_in
+ //! Pointer to the input sequence of data items
+ //!
+ //! @param[out] d_out
+ //! Pointer to the output aggregate
+ //!
+ //! @param[in] num_segments
+ //! The number of segments that comprise the sorting data
+ //!
+ //! @param[in] d_begin_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of beginning offsets of
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+ //! @endrst
+ //!
+ //! @param[in] d_end_offsets
+ //! @rst
+ //! Random-access input iterator to the sequence of ending offsets of length
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
+ //! @endrst
+ //!
+ //! @param[in] stream
+ //! @rst
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+ //!
@endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - */ +//! @rst +//! DeviceSegmentedSort provides device-wide, parallel operations for +//! computing a batched sort across multiple, non-overlapping sequences of +//! data items residing within device-accessible memory. +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! The algorithm arranges items into ascending (or descending) order. +//! The underlying sorting algorithm is undefined. Depending on the segment size, +//! it might be radix sort, merge sort or something else. Therefore, no +//! assumptions on the underlying implementation should be made. +//! +//! Differences from DeviceSegmentedRadixSort +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! DeviceSegmentedRadixSort is optimized for significantly large segments (tens +//! 
of thousands of items and more). Nevertheless, some domains produce a wide +//! range of segment sizes. DeviceSegmentedSort partitions segments into size +//! groups and specialize sorting algorithms for each group. This approach leads +//! to better resource utilization in the presence of segment size imbalance or +//! moderate segment sizes (up to thousands of items). +//! This algorithm is more complex and consists of multiple kernels. This fact +//! leads to longer compilation times as well as larger binaries sizes. +//! +//! Supported Types +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! The algorithm has to satisfy the underlying algorithms restrictions. Radix +//! sort usage restricts the list of supported types. Therefore, +//! DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types +//! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half`` and +//! ``__nv_bfloat16`` 16-bit floating-point types. +//! +//! Segments are not required to be contiguous. Any element of input(s) or +//! output(s) outside the specified segments will not be accessed nor modified. +//! +//! A simple example +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! .. code-block:: c++ +//! +//! #include +//! // or equivalently +//! +//! // Declare, allocate, and initialize device-accessible pointers +//! // for sorting data +//! int num_items; // e.g., 7 +//! int num_segments; // e.g., 3 +//! int *d_offsets; // e.g., [0, 3, 3, 7] +//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] +//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] +//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] +//! int *d_values_out; // e.g., [-, -, -, -, -, -, -] +//! ... +//! +//! // Determine temporary device storage requirements +//! void *d_temp_storage = NULL; +//! size_t temp_storage_bytes = 0; +//! cub::DeviceSegmentedSort::SortPairs( +//! d_temp_storage, temp_storage_bytes, +//! d_keys_in, d_keys_out, d_values_in, d_values_out, +//! 
num_items, num_segments, d_offsets, d_offsets + 1); +//! +//! // Allocate temporary storage +//! cudaMalloc(&d_temp_storage, temp_storage_bytes); +//! +//! // Run sorting operation +//! cub::DeviceSegmentedSort::SortPairs( +//! d_temp_storage, temp_storage_bytes, +//! d_keys_in, d_keys_out, d_values_in, d_values_out, +//! num_items, num_segments, d_offsets, d_offsets + 1); +//! +//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] +//! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] +//! +//! @endrst struct DeviceSegmentedSort { - - /*************************************************************************//** - * @name Keys-only - ****************************************************************************/ - //@{ - - /** - * @brief Sorts segments of keys into ascending order. Approximately - * `num_items + 2*num_segments` auxiliary storage required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - SortKeys is not guaranteed to be stable. That is, suppose that @p i and - * @p j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two elements will be - * preserved by sort. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. 
- * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible - * // pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @name Keys-only + //! @{ + + //! @rst + //! Sorts segments of keys into ascending order. + //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as `segment_offsets+1`). + //! - SortKeys is not guaranteed to be stable. That is, suppose that ``i`` and + //! ``j`` are equivalent: neither one is less than the other. It is not + //! 
guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible + //! // pointers for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! 
beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -303,112 +307,113 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. Approximately - * `num_items + 2*num_segments` auxiliary storage required. 
- * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments + 1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets + 1`). - * - SortKeysDescending is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no - * work is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. Approximately + //! ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! 
not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -477,122 +482,125 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into ascending order. Approximately - * `2*num_segments` auxiliary storage required. 
- * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - SortKeys is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. 
- * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible - * // pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no - * work is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` - * and `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! 
of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortKeys is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible + //! // pointers for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! 
size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no + //! work is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! 
Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -658,122 +666,126 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. Approximately - * `2*num_segments` auxiliary storage required. - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments + 1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets + 1`). - * - SortKeysDescending is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. 
Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible 
allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1<= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. Approximately + //! ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! 
buffers now contains the sorted output sequence (a function of the number + //! of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for + //! // sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! 
// Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! 
element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -839,113 +851,117 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into ascending order. Approximately - * `num_items + 2*num_segments` auxiliary storage required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeys is stable: it preserves the relative ordering of - * equivalent elements. That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. 
- * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. Approximately + //! ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortKeys is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! 
``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! 
@tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1002,113 +1018,117 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. 
Approximately - * `num_items + 2*num_segments` auxiliary storage required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeysDescending is stable: it preserves the relative ordering of - * equivalent elements. That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. + //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortKeysDescending is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither ``x < y`` nor ``y < x``) + //! 
then a postcondition of stable sort is that ``x`` still precedes ``y``. + //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap + //! ``[d_keys_in, d_keys_in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not + //! be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! 
**[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and + //! ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1167,123 +1187,128 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into ascending order. 
Approximately - * `2*num_segments` auxiliary storage required. - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeys is stable: it preserves the relative ordering of - * equivalent elements. That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. 
- * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeys( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i] - 1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into ascending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! 
of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortKeys is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. + //! The range ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_keys.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys. + //! + //! .. code-block:: c++ + //! + //! #include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a DoubleBuffer to wrap the pair of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! 
// Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeys( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! 
element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1336,123 +1361,127 @@ struct DeviceSegmentedSort stream); } - /** - * @brief Sorts segments of keys into descending order. Approximately - * `2*num_segments` auxiliary storage required. - * - * @par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number - * of key bits and the targeted device architecture). - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - StableSortKeysDescending is stable: it preserves the relative ordering of - * equivalent elements. 
That is, if @p x and @p y are elements such that - * @p x precedes @p y, and if the two elements are equivalent (neither - * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that - * @p x still precedes @p y. - * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortKeysDescending( - * d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that d_end_offsets[i]-1 is the last - * element of the ith data segment in `d_keys_*` and - * `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the - * i-th segment is considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of keys into descending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers managed by a + //! DoubleBuffer structure that indicates which of the two buffers is + //! "current" (and thus contains the input data to be sorted). + //! - The contents of both buffers may be altered by the sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! 
``segment_offsets`` (of length ``num_segments + 1``) can be aliased
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
+ //! the latter is specified as ``segment_offsets + 1``).
+ //! - StableSortKeysDescending is stable: it preserves the relative ordering of
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
+ //! ``x`` still precedes ``y``.
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
+ //! The range ``[cur, cur + num_items)`` shall not overlap
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
+ //! - Segments are not required to be contiguous. For all index values ``i``
+ //! outside the specified segments ``d_keys.Current()[i]``,
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/device/device_segmented_sort.cuh> // or equivalently <cub/cub.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! 
cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortKeysDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! 
Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and + //! ``d_values_*``. If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the + //! ``i``-th segment is considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -1507,137 +1536,139 @@ struct DeviceSegmentedSort stream); } - //@} end member group - /*************************************************************************//** - * @name Key-value pairs - ****************************************************************************/ - //@{ - - /** - * @brief Sorts segments of key-value pairs into ascending order. - * Approximately `2*num_items + 2*num_segments` auxiliary storage - * required. - * - * @par - * - The contents of the input data are not altered by the sorting operation. - * - When the input is a contiguous sequence of segments, a single sequence - * @p segment_offsets (of length `num_segments+1`) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as `segment_offsets+1`). - * - SortPairs is not guaranteed to be stable. That is, suppose that @p i and - * @p j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. 
For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys with associated vector of - * @p int values. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible 
allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @} end member group + //! @name Key-value pairs + //! @{ + + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! 
``segment_offsets`` (of length ``num_segments + 1``) can be aliased
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
+ //! the latter is specified as ``segment_offsets + 1``).
+ //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
+ //! ``j`` are equivalent: neither one is less than the other. It is not
+ //! guaranteed that the relative order of these two elements will be
+ //! preserved by sort.
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
+ //! not overlap ``[in, in + num_items)``,
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
+ //! - Segments are not required to be contiguous. For all index values ``i``
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
+ //! ``int`` values.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/device/device_segmented_sort.cuh>
+ //! // or equivalently <cub/cub.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers
+ //! // for sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedSort::SortPairs(
+ //! 
d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! 
The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i]-1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! 
- SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
+ //! ``j`` are equivalent: neither one is less than the other. It is not
+ //! guaranteed that the relative order of these two elements will be
+ //! preserved by sort.
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
+ //! not overlap ``[in, in + num_items)``,
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
+ //! - Segments are not required to be contiguous. For all index values ``i``
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
+ //!
+ //! Snippet
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! The code snippet below illustrates the batched sorting of three segments
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
+ //! ``int`` values.
+ //!
+ //! .. code-block:: c++
+ //!
+ //! #include <cub/device/device_segmented_sort.cuh>
+ //! // or equivalently <cub/cub.cuh>
+ //!
+ //! // Declare, allocate, and initialize device-accessible pointers for
+ //! // sorting data
+ //! int num_items; // e.g., 7
+ //! int num_segments; // e.g., 3
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+ //! ...
+ //!
+ //! // Determine temporary device storage requirements
+ //! void *d_temp_storage = NULL;
+ //! size_t temp_storage_bytes = 0;
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
+ //! d_temp_storage, temp_storage_bytes,
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
+ //!
+ //! // Allocate temporary storage
+ //! 
cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! 
length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! - The contents of both buffers within each pair may be altered by the sorting + //! operation. + //! 
- Upon completion, the sorting operation will update the "current" indicator + //! within each DoubleBuffer wrapper to reference which of the two buffers + //! now contains the sorted output sequence (a function of the number of key bits + //! specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and + //! ``j`` are equivalent: neither one is less than the other. It is not + //! guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! 
int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! 
Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template num_segments+1) can be aliased - * for both the @p d_begin_offsets and @p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - SortPairsDescending is not guaranteed to be stable. That is, suppose that - * @p i and @p j are equivalent: neither one is less than the other. It is - * not guaranteed that the relative order of these two elements will be - * preserved by sort. - * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` - * be any of `{d_keys.Alternate(), d_values.Alternate()}`. 
The range - * `[cur, cur + num_items)` shall not overlap - * `[alt, alt + num_items)`. Both ranges shall not overlap - * `[d_begin_offsets, d_begin_offsets + num_segments)` nor - * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, - * `d_values.Alternate()[i]` will not be accessed nor modified. - * - * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of @p int keys with associated vector of - * @p int values. - * - * @par - * @code - * #include - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for - * // sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::SortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! 
- The contents of both buffers within each pair may be altered by the + //! sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - SortPairsDescending is not guaranteed to be stable. That is, suppose that + //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is + //! not guaranteed that the relative order of these two elements will be + //! preserved by sort. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> // or equivalently + //! + //! 
// Declare, allocate, and initialize device-accessible pointers for + //! // sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::SortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! 
+ //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When nullptr, the - * required allocation size is written to @p temp_storage_bytes and no work is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! 
- StableSortPairs is stable: it preserves the relative ordering of + //! equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! 
num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When nullptr, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! 
Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_keys_in - * Device-accessible pointer to the input data of key data to sort - * - * @param[out] d_keys_out - * Device-accessible pointer to the sorted output sequence of key data - * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of - * associated value items - * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output - * sequence of associated value items - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. + //! + //! - The contents of the input data are not altered by the sorting operation. + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! 
for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortPairsDescending is stable: it preserves the relative ordering + //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of + //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall + //! not overlap ``[in, in + num_items)``, + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, + //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! #include <cub/cub.cuh> // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! 
cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_keys_out, d_values_in, d_values_out, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Device-accessible pointer to the input data of key data to sort + //! + //! @param[out] d_keys_out + //! Device-accessible pointer to the sorted output sequence of key data + //! + //! @param[in] d_values_in + //! Device-accessible pointer to the corresponding input sequence of + //! associated value items + //! + //! @param[out] d_values_out + //! Device-accessible pointer to the correspondingly-reordered output + //! sequence of associated value items + //! + //! @param[in] num_items + //! 
The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairs( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into ascending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! 
- The contents of both buffers within each pair may be altered by the + //! sorting operation. + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number + //! of key bits specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortPairs is stable: it preserves the relative ordering + //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. code-block:: c++ + //! + //! 
#include + //! // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairs( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template - * // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedSort::StableSortPairsDescending( - * d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * @endcode - * - * @tparam KeyT - * [inferred] Key type - * - * @tparam ValueT - * [inferred] Value type - * - * @tparam BeginOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * beginning offsets \iterator - * - * @tparam EndOffsetIteratorT - * [inferred] Random-access input iterator type for reading segment - * ending offsets \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. 
When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work - * is done - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to - * point to the sorted output keys - * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer contains - * the unsorted input values and, upon return, is updated to point to the - * sorted output values - * - * @param[in] num_items - * The total number of items to sort (across all segments) - * - * @param[in] num_segments - * The number of segments that comprise the sorting data - * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length @p num_segments, such that `d_begin_offsets[i]` is the first - * element of the ith data segment in `d_keys_*` and - * `d_values_*` - * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of - * the ith data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is - * considered empty. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is - * stream0. - */ + //! @rst + //! Sorts segments of key-value pairs into descending order. + //! Approximately ``2 * num_segments`` auxiliary storage required. + //! + //! - The sorting operation is given a pair of key buffers and a corresponding + //! pair of associated value buffers. Each pair is managed by a DoubleBuffer + //! structure that indicates which of the two buffers is "current" (and thus + //! contains the input data to be sorted). + //! 
- The contents of both buffers within each pair may be altered by the sorting + //! operation. + //! - Upon completion, the sorting operation will update the "current" indicator + //! within each DoubleBuffer wrapper to reference which of the two buffers + //! now contains the sorted output sequence (a function of the number of key bits + //! specified and the targeted device architecture). + //! - When the input is a contiguous sequence of segments, a single sequence + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where + //! the latter is specified as ``segment_offsets + 1``). + //! - StableSortPairsDescending is stable: it preserves the relative ordering + //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that + //! ``x`` precedes ``y``, and if the two elements are equivalent (neither + //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that + //! ``x`` still precedes ``y``. + //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` + //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range + //! ``[cur, cur + num_items)`` shall not overlap + //! ``[alt, alt + num_items)``. Both ranges shall not overlap + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor + //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. + //! - Segments are not required to be contiguous. For all index values ``i`` + //! outside the specified segments ``d_keys.Current()[i]``, + //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, + //! ``d_values.Alternate()[i]`` will not be accessed nor modified. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the batched sorting of three segments + //! (with one zero-length segment) of ``int`` keys with associated vector of + //! ``int`` values. + //! + //! .. 
code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for sorting data + //! int num_items; // e.g., 7 + //! int num_segments; // e.g., 3 + //! int *d_offsets; // e.g., [0, 3, 3, 7] + //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + //! ... + //! + //! // Create a set of DoubleBuffers to wrap pairs of device pointers + //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run sorting operation + //! cub::DeviceSegmentedSort::StableSortPairsDescending( + //! d_temp_storage, temp_storage_bytes, d_keys, d_values, + //! num_items, num_segments, d_offsets, d_offsets + 1); + //! + //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + //! + //! @endrst + //! + //! @tparam KeyT + //! **[inferred]** Key type + //! + //! @tparam ValueT + //! **[inferred]** Value type + //! + //! @tparam BeginOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! beginning offsets @iterator + //! + //! @tparam EndOffsetIteratorT + //! **[inferred]** Random-access input iterator type for reading segment + //! ending offsets @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work + //! is done + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to + //! point to the sorted output keys + //! + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer contains + //! the unsorted input values and, upon return, is updated to point to the + //! sorted output values + //! + //! @param[in] num_items + //! The total number of items to sort (across all segments) + //! + //! @param[in] num_segments + //! The number of segments that comprise the sorting data + //! + //! @param[in] d_begin_offsets + //! @rst + //! Random-access input iterator to the sequence of beginning offsets of + //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first + //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` + //! @endrst + //! + //! @param[in] d_end_offsets + //! @rst + //! Random-access input iterator to the sequence of ending offsets of length + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of + //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is + //! considered empty. + //! @endrst + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, - * // flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7] - * // d_num_selected_out <-- [4] - * - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading selection - * flags \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[in] d_flags - * Pointer to the input sequence of selection flags - * - * @param[out] d_out - * Pointer to the output sequence of selected data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * (i.e., length of `d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``d_flags`` sequence to selectively copy the corresponding items from ``d_in`` into ``d_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). + //! - Copies of the selected items are compacted into ``d_out`` and maintain their original relative ordering. + //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap ``[d_in, d_in + num_items)``, + //! | ``[d_flags, d_flags + num_items)`` nor ``d_num_selected_out`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input, + //! // flags, and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! 
void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_out, d_num_selected_out, num_items); + //! + //! // d_out <-- [1, 4, 6, 7] + //! // d_num_selected_out <-- [4] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam FlagIterator + //! **[inferred]** Random-access input iterator type for reading selection flags @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[in] d_flags + //! Pointer to the input sequence of selection flags + //! + //! @param[out] d_out + //! Pointer to the output sequence of selected data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected (i.e., length of `d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, - * // flags, and output - * int num_items; // e.g., 8 - * int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, - * d_in, d_flags, d_num_selected_out, num_items); - * - * // d_data <-- [1, 4, 6, 7] - * // d_num_selected_out <-- [4] - * - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access iterator type for reading and writing - * selected items \iterator - * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading selection - * flags \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Pointer to the sequence of data items - * - * @param[in] d_flags - * Pointer to the input sequence of selection flags - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_data`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``d_flags`` sequence to selectively compact the items in ``d_data``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). + //! - Copies of the selected items are compacted in-place and maintain their original relative ordering. + //! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap + //! | ``[d_flags, d_flags + num_items)`` in any other way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input, + //! // flags, and output + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_data, d_flags, d_num_selected_out, num_items); + //! + //! 
// Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::Flagged( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_flags, d_num_selected_out, num_items); + //! + //! // d_data <-- [1, 4, 6, 7] + //! // d_num_selected_out <-- [4] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access iterator type for reading and writing selected items @iterator + //! + //! @tparam FlagIterator + //! **[inferred]** Random-access input iterator type for reading selection flags @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Pointer to the sequence of data items + //! + //! @param[in] d_flags + //! Pointer to the input sequence of selection flags + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_data`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -403,125 +373,107 @@ struct DeviceSelect stream); } - /** - * @brief Uses the `select_op` functor to selectively copy items from `d_in` - * into `d_out`. The total number of items selected is written to - * `d_num_selected_out`. ![](select_logo.png) - * - * @par - * - Copies of the selected items are compacted into `d_out` and maintain - * their original relative ordering. 
- * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap - * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated select-if performance across - * different CUDA architectures for `int32` and `int64` items, respectively. - * Items are selected with 50% probability. - * - * @image html select_if_int32_50_percent.png - * @image html select_if_int64_50_percent.png - * - * @par - * The following charts are similar, but 5% selection probability: - * - * @image html select_if_int32_5_percent.png - * @image html select_if_int64_5_percent.png - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an `int` device vector. - * @par - * @code - * #include // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectOp - * **[inferred]** Selection operator type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of selected data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * (i.e., length of `d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] select_op - * Unary selection operator - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! 
Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering. + //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap + //! | ``[d_in, d_in + num_items)`` nor ``d_num_selected_out`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const { + //! return (a < compare); + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! LessThan select_op(7); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items, select_op); + //! + //! // d_out <-- [0, 2, 3, 5, 2] + //! 
// d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam SelectOp + //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of selected data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! (i.e., length of `d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] select_op + //! Unary selection operator + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! 
@endrst template // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_data, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, - * d_data, d_num_selected_out, num_items, select_op); - * - * // d_data <-- [0, 2, 3, 5, 2] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading and - * writing items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @tparam SelectOp - * **[inferred]** Selection operator type having member - * `bool operator()(const T &a)` - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in,out] d_data - * Pointer to the sequence of data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_data`) - * - * @param[in] select_op - * Unary selection operator - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - | Copies of the selected items are compacted in ``d_data`` and maintain + //! | their original relative ordering. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Functor type for selecting values less than some criteria + //! struct LessThan + //! { + //! int compare; + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! LessThan(int compare) : compare(compare) {} + //! + //! CUB_RUNTIME_FUNCTION __forceinline__ + //! bool operator()(const int &a) const { + //! return (a < compare); + //! } + //! }; + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + //! int *d_num_selected_out; // e.g., [ ] + //! LessThan select_op(7); + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! 
d_data, d_num_selected_out, num_items, select_op); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::If( + //! d_temp_storage, temp_storage_bytes, + //! d_data, d_num_selected_out, num_items, select_op); + //! + //! // d_data <-- [0, 2, 3, 5, 2] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam IteratorT + //! **[inferred]** Random-access input iterator type for reading and writing items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @tparam SelectOp + //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in,out] d_data + //! Pointer to the sequence of data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_data`) + //! + //! @param[in] select_op + //! Unary selection operator + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -745,107 +694,87 @@ struct DeviceSelect stream); } - /** - * @brief Given an input sequence `d_in` having runs of consecutive - * equal-valued keys, only the first key from each run is selectively - * copied to `d_out`. The total number of items selected is written to - * `d_num_selected_out`. 
![](unique_logo.png) - * - * @par - * - The `==` equality operator is used to determine whether keys are - * equivalent - * - Copies of the selected items are compacted into `d_out` and maintain - * their original relative ordering. - * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap - * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. - * - @devicestorage - * - * @par Performance - * The following charts illustrate saturated select-unique performance across different - * CUDA architectures for `int32` and `int64` items, respectively. Segments - * have lengths uniformly sampled from `[1, 1000]`. - * - * @image html select_unique_int32_len_500.png - * @image html select_unique_int64_len_500.png - * - * @par - * The following charts are similar, but with segment lengths uniformly - * sampled from `[1, 10]`: - * - * @image html select_unique_int32_len_5.png - * @image html select_unique_int64_len_5.png - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an `int` device vector. - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Unique( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Unique( - * d_temp_storage, temp_storage_bytes, - * d_in, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [0, 2, 9, 5, 8] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * items \iterator - * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * items \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_in - * Pointer to the input sequence of data items - * - * @param[out] d_out - * Pointer to the output sequence of selected data items - * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected - * (i.e., length of `d_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Given an input sequence ``d_in`` having runs of consecutive equal-valued keys, + //! only the first key from each run is selectively copied to ``d_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! 
- The ``==`` equality operator is used to determine whether keys are equivalent + //! - Copies of the selected items are compacted into ``d_out`` and maintain their original relative ordering. + //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap + //! | ``[d_in, d_in + num_items)`` nor ``d_num_selected_out`` in any way. + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::Unique( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::Unique( + //! d_temp_storage, temp_storage_bytes, + //! d_in, d_out, d_num_selected_out, num_items); + //! + //! // d_out <-- [0, 2, 9, 5, 8] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam InputIteratorT + //! **[inferred]** Random-access input iterator type for reading input items @iterator + //! + //! @tparam OutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected items @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! 
Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_in + //! Pointer to the input sequence of data items + //! + //! @param[out] d_out + //! Pointer to the output sequence of selected data items + //! + //! @param[out] d_num_selected_out + //! Pointer to the output total number of items selected + //! (i.e., length of `d_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template @@ -908,118 +837,112 @@ struct DeviceSelect stream); } - /** - * @brief Given an input sequence `d_keys_in` and `d_values_in` with runs of - * key-value pairs with consecutive equal-valued keys, only the first - * key and its value from each run is selectively copied to - * `d_keys_out` and `d_values_out`. The total number of items selected - * is written to `d_num_selected_out`. ![](unique_logo.png) - * - * @par - * - The `==` equality operator is used to determine whether keys are - * equivalent - * - Copies of the selected items are compacted into `d_out` and maintain - * their original relative ordering. - * - In-place operations are not supported. There must be no overlap between - * any of the provided ranges: - * - `[d_keys_in, d_keys_in + num_items)` - * - `[d_keys_out, d_keys_out + *d_num_selected_out)` - * - `[d_values_in, d_values_in + num_items)` - * - `[d_values_out, d_values_out + *d_num_selected_out)` - * - `[d_num_selected_out, d_num_selected_out + 1)` - * - @devicestorage - * - * @par Snippet - * The code snippet below illustrates the compaction of items selected from - * an `int` device vector. 
- * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers - * // for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * int *d_keys_out; // e.g., [ , , , , , , , ] - * int *d_values_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::UniqueByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, - * d_keys_out, d_values_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::UniqueByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, - * d_keys_out, d_values_out, d_num_selected_out, num_items); - * - * // d_keys_out <-- [0, 2, 9, 5, 8] - * // d_values_out <-- [1, 2, 4, 5, 8] - * // d_num_selected_out <-- [5] - * @endcode - * - * @tparam KeyInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * keys \iterator - * - * @tparam ValueInputIteratorT - * **[inferred]** Random-access input iterator type for reading input - * values \iterator - * - * @tparam KeyOutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * keys \iterator - * - * @tparam ValueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected - * values \iterator - * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items - * selected \iterator - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work - * is done. 
- * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in - * Pointer to the input sequence of keys - * - * @param[in] d_values_in - * Pointer to the input sequence of values - * - * @param[out] d_keys_out - * Pointer to the output sequence of selected keys - * - * @param[out] d_values_out - * Pointer to the output sequence of selected values - * - * @param[out] d_num_selected_out - * Pointer to the total number of items selected (i.e., length of - * `d_keys_out` or `d_values_out`) - * - * @param[in] num_items - * Total number of input items (i.e., length of `d_keys_in` or - * `d_values_in`) - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. - * Default is stream0. - */ + //! @rst + //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive + //! equal-valued keys, only the first key and its value from each run is selectively copied + //! to ``d_keys_out`` and ``d_values_out``. + //! The total number of items selected is written to ``d_num_selected_out``. + //! + //! - The ``==`` equality operator is used to determine whether keys are equivalent + //! - Copies of the selected items are compacted into ``d_out`` and maintain + //! their original relative ordering. + //! - In-place operations are not supported. There must be no overlap between + //! any of the provided ranges: + //! + //! - ``[d_keys_in, d_keys_in + num_items)`` + //! - ``[d_keys_out, d_keys_out + *d_num_selected_out)`` + //! - ``[d_values_in, d_values_in + num_items)`` + //! - ``[d_values_out, d_values_out + *d_num_selected_out)`` + //! - ``[d_num_selected_out, d_num_selected_out + 1)`` + //! + //! - @devicestorage + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. + //! + //! .. code-block:: c++ + //! + //! 
#include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers + //! // for input and output + //! int num_items; // e.g., 8 + //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + //! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + //! int *d_keys_out; // e.g., [ , , , , , , , ] + //! int *d_values_out; // e.g., [ , , , , , , , ] + //! int *d_num_selected_out; // e.g., [ ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void *d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run selection + //! cub::DeviceSelect::UniqueByKey( + //! d_temp_storage, temp_storage_bytes, + //! d_keys_in, d_values_in, + //! d_keys_out, d_values_out, d_num_selected_out, num_items); + //! + //! // d_keys_out <-- [0, 2, 9, 5, 8] + //! // d_values_out <-- [1, 2, 4, 5, 8] + //! // d_num_selected_out <-- [5] + //! + //! @endrst + //! + //! @tparam KeyInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input keys @iterator + //! + //! @tparam ValueInputIteratorT + //! **[inferred]** Random-access input iterator type for reading input values @iterator + //! + //! @tparam KeyOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected keys @iterator + //! + //! @tparam ValueOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing selected values @iterator + //! + //! @tparam NumSelectedIteratorT + //! **[inferred]** Output iterator type for recording the number of items selected @iterator + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! 
required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_keys_in + //! Pointer to the input sequence of keys + //! + //! @param[in] d_values_in + //! Pointer to the input sequence of values + //! + //! @param[out] d_keys_out + //! Pointer to the output sequence of selected keys + //! + //! @param[out] d_values_out + //! Pointer to the output sequence of selected values + //! + //! @param[out] d_num_selected_out + //! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`) + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) - * performs the matrix-vector operation - * y = A*x + y, - * where: - * - A is an mxn sparse matrix whose non-zero structure is specified in - * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) - * (i.e., three arrays: values, row_offsets, and column_indices) - * - x and y are dense vectors - * - * @par Usage Considerations - * @cdp_class{DeviceSpmv} - * - */ +//! @rst +//! DeviceSpmv provides device-wide parallel operations for performing +//! sparse-matrix * dense-vector multiplication (SpMV). +//! +//! Overview +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! The `SpMV computation `_ +//! performs the matrix-vector operation ``y = A * x + y``, where: +//! +//! - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in +//! `compressed-storage-row (CSR) format `_ +//! (i.e., three arrays: ``values``, ``row_offsets``, and ``column_indices``) +//! 
- ``x`` and ``y`` are dense vectors +//! +//! Usage Considerations +//! +++++++++++++++++++++++++++++++++++++++++++++ +//! +//! @cdp_class{DeviceSpmv} +//! +//! @endrst struct DeviceSpmv { - /******************************************************************//** - * @name CSR matrix operations - *********************************************************************/ - //@{ - - /** - * @brief This function performs the matrix-vector operation - * y = A*x. - * - * @par Snippet - * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A - * representing a 3x3 lattice (24 non-zeros). - * - * @par - * @code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input - * vector x, - * // and output vector y - * int num_rows = 9; - * int num_cols = 9; - * int num_nonzeros = 24; - * - * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1] - * - * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, - * // 4, 6, 1, 3, 5, 7, 2, 4, - * // 8, 3, 7, 4, 6, 8, 5, 7] - * - * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] - * - * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] - * float* d_vector_y; // e.g., [ , , , , , , , , ] - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run SpMV - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros); - * - * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] - * - * @endcode - * - * @tparam ValueT - * [inferred] Matrix and vector value type (e.g., @p float, @p double, etc.) - * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When NULL, the required allocation size is written to @p temp_storage_bytes - * and no work is done. - * - * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation - * - * @param[in] d_values - * Pointer to the array of @p num_nonzeros values of the corresponding nonzero elements - * of matrix A. - * - * @param[in] d_row_offsets - * Pointer to the array of @p m + 1 offsets demarcating the start of every row in - * @p d_column_indices and @p d_values (with the final entry being equal to @p num_nonzeros) - * - * @param[in] d_column_indices - * Pointer to the array of @p num_nonzeros column-indices of the corresponding nonzero - * elements of matrix A. (Indices are zero-valued.) - * - * @param[in] d_vector_x - * Pointer to the array of @p num_cols values corresponding to the dense input vector - * x - * - * @param[out] d_vector_y - * Pointer to the array of @p num_rows values corresponding to the dense output vector - * y - * - * @param[in] num_rows - * number of rows of matrix A. - * - * @param[in] num_cols - * number of columns of matrix A. 
- * - * @param[in] num_nonzeros - * number of nonzero elements of matrix A. - * - * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is stream0. - */ + //! @name CSR matrix operations + //! @{ + + //! @rst + //! This function performs the matrix-vector operation ``y = A*x``. + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The code snippet below illustrates SpMV upon a 9x9 CSR matrix ``A`` representing a 3x3 lattice (24 non-zeros). + //! + //! .. code-block:: c++ + //! + //! #include // or equivalently + //! + //! // Declare, allocate, and initialize device-accessible pointers for input matrix A, input + //! vector x, + //! // and output vector y + //! int num_rows = 9; + //! int num_cols = 9; + //! int num_nonzeros = 24; + //! + //! float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + //! // 1, 1, 1, 1, 1, 1, 1, 1, + //! // 1, 1, 1, 1, 1, 1, 1, 1] + //! + //! int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + //! // 4, 6, 1, 3, 5, 7, 2, 4, + //! // 8, 3, 7, 4, 6, 8, 5, 7] + //! + //! int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + //! + //! float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + //! float* d_vector_y; // e.g., [ , , , , , , , , ] + //! ... + //! + //! // Determine temporary device storage requirements + //! void* d_temp_storage = NULL; + //! size_t temp_storage_bytes = 0; + //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + //! num_rows, num_cols, num_nonzeros); + //! + //! // Allocate temporary storage + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); + //! + //! // Run SpMV + //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + //! num_rows, num_cols, num_nonzeros); + //! + //! // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + //! + //! @endrst + //! + //! @tparam ValueT + //! 
**[inferred]** Matrix and vector value type (e.g., `float`, `double`, etc.) + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. + //! When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] d_values + //! Pointer to the array of `num_nonzeros` values of the corresponding nonzero elements + //! of matrix `A`. + //! + //! @param[in] d_row_offsets + //! Pointer to the array of `m + 1` offsets demarcating the start of every row in + //! `d_column_indices` and `d_values` (with the final entry being equal to `num_nonzeros`) + //! + //! @param[in] d_column_indices + //! Pointer to the array of `num_nonzeros` column-indices of the corresponding nonzero + //! elements of matrix `A`. (Indices are zero-valued.) + //! + //! @param[in] d_vector_x + //! Pointer to the array of `num_cols` values corresponding to the dense input vector `x` + //! + //! @param[out] d_vector_y + //! Pointer to the array of `num_rows` values corresponding to the dense output vector `y` + //! + //! @param[in] num_rows + //! number of rows of matrix `A`. + //! + //! @param[in] num_cols + //! number of columns of matrix `A`. + //! + //! @param[in] num_nonzeros + //! number of nonzero elements of matrix `A`. + //! + //! @param[in] stream + //! @rst + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t CsrMV(void *d_temp_storage, size_t &temp_storage_bytes, @@ -240,7 +234,7 @@ struct DeviceSpmv stream); } - //@} end member group + //! 
@} end member group }; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 16bd05db81..5a76e73e5e 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -339,11 +339,11 @@ struct DeviceBatchMemcpyPolicy }; /** - * @tparam InputBufferIt [inferred] Random-access input iterator type providing the pointers + * @tparam InputBufferIt **[inferred]** Random-access input iterator type providing the pointers * to the source memory buffers - * @tparam OutputBufferIt [inferred] Random-access input iterator type providing the pointers + * @tparam OutputBufferIt **[inferred]** Random-access input iterator type providing the pointers * to the destination memory buffers - * @tparam BufferSizeIteratorT [inferred] Random-access input iterator type providing the + * @tparam BufferSizeIteratorT **[inferred]** Random-access input iterator type providing the * number of bytes to be copied for each pair of buffers * @tparam BufferOffsetT Integer type large enough to hold any offset in [0, num_buffers) * @tparam BlockOffsetT Integer type large enough to hold any offset in [0, diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 6678c2cf87..79e48a8ab3 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -139,7 +139,7 @@ DeviceHistogramInitKernel(ArrayWrapper num_output_bins * Number of channels actively being histogrammed * * @tparam SampleIteratorT - * The input iterator type. \iterator. + * The input iterator type. @iterator. 
* * @tparam CounterT * Integer type for counting sample occurrences per histogram bin @@ -565,7 +565,7 @@ struct dispatch_histogram * Number of channels actively being histogrammed * * @tparam SampleIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam CounterT * Integer type for counting sample occurrences per histogram bin diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index fe0554c932..07e99da614 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -515,10 +515,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE * Value type * * @tparam BeginOffsetIteratorT - * Random-access input iterator type for reading segment beginning offsets \iterator + * Random-access input iterator type for reading segment beginning offsets @iterator * * @tparam EndOffsetIteratorT - * Random-access input iterator type for reading segment ending offsets \iterator + * Random-access input iterator type for reading segment ending offsets @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -536,12 +536,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE * Output values buffer * * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + * Random-access input iterator to the sequence of beginning offsets of length `num_segments`, * such that d_begin_offsets[i] is the first element of the ith * data segment in d_keys_* and d_values_* * * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * Random-access input iterator to the sequence of ending offsets of length `num_segments`, * such that d_end_offsets[i]-1 is the last element of 
the ith * data segment in d_keys_* and d_values_*. * If d_end_offsets[i]-1 <= d_begin_offsets[i], @@ -1380,11 +1380,11 @@ struct DispatchRadixSort : SelectedPolicy //------------------------------------------------------------------------------ /// Device-accessible allocation of temporary storage. - // When NULL, the required allocation size is written to @p temp_storage_bytes and no work is + // When NULL, the required allocation size is written to `temp_storage_bytes` and no work is // done. void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is @@ -2343,10 +2343,10 @@ struct DispatchRadixSort : SelectedPolicy * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required - * allocation size is written to @p temp_storage_bytes and no work is done. + * allocation size is written to `temp_storage_bytes` and no work is done. 
* * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Double-buffer whose current buffer contains the unsorted input keys and, @@ -2467,10 +2467,10 @@ struct DispatchRadixSort : SelectedPolicy * Value type * * @tparam BeginOffsetIteratorT - * Random-access input iterator type for reading segment beginning offsets \iterator + * Random-access input iterator type for reading segment beginning offsets @iterator * * @tparam EndOffsetIteratorT - * Random-access input iterator type for reading segment ending offsets \iterator + * Random-access input iterator type for reading segment ending offsets @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -2497,10 +2497,10 @@ struct DispatchSegmentedRadixSort : SelectedPolicy //------------------------------------------------------------------------------ /// Device-accessible allocation of temporary storage. When NULL, the required allocation size - /// is written to @p temp_storage_bytes and no work is done. + /// is written to `temp_storage_bytes` and no work is done. 
void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is @@ -2517,12 +2517,12 @@ struct DispatchSegmentedRadixSort : SelectedPolicy /// The number of segments that comprise the sorting data OffsetT num_segments; - /// Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + /// Random-access input iterator to the sequence of beginning offsets of length `num_segments`, /// such that d_begin_offsets[i] is the first element of the ith /// data segment in d_keys_* and d_values_* BeginOffsetIteratorT d_begin_offsets; - /// Random-access input iterator to the sequence of ending offsets of length @p num_segments, + /// Random-access input iterator to the sequence of ending offsets of length `num_segments`, /// such that d_end_offsets[i]-1 is the last element of the ith /// data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 /// <= d_begin_offsets[i], the ith is considered empty. @@ -2862,10 +2862,10 @@ struct DispatchSegmentedRadixSort : SelectedPolicy * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required allocation size - * is written to @p temp_storage_bytes and no work is done. + * is written to `temp_storage_bytes` and no work is done. 
* * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Double-buffer whose current buffer contains the unsorted input keys and, upon return, is @@ -2883,11 +2883,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that d_begin_offsets[i] is the first element of the + * `num_segments`, such that d_begin_offsets[i] is the first element of the * ith data segment in d_keys_* and d_values_* * * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * Random-access input iterator to the sequence of ending offsets of length `num_segments`, * such that d_end_offsets[i]-1 is the last element of the ith * data segment in d_keys_* and d_values_*. * If d_end_offsets[i]-1 <= d_begin_offsets[i], diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 553d52b0a5..e429c2edbb 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -129,7 +129,7 @@ __host__ __device__ void finalize_and_store_aggregate(OutputIteratorT d_out, * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -205,10 +205,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy: * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the 
reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -311,18 +311,18 @@ NormalizeReductionOutput(KeyValuePairT &val, * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets - * \iterator + * @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets - * \iterator + * @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -520,10 +520,10 @@ struct DeviceReducePolicy * device-wide reduction * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -1016,10 +1016,10 @@ struct DispatchReduce : SelectedPolicy * device-wide transpose reduce * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets @@ -1058,18 +1058,18 @@ using DispatchTransformReduce = * device-wide reduction * * @tparam InputIteratorT - * 
Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT - * Output iterator type for recording the reduced aggregate \iterator + * Output iterator type for recording the reduced aggregate @iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets - * \iterator + * @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets - * \iterator + * @iterator * * @tparam OffsetT * Signed integer type for global offsets diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 8717792ee8..8ad76e591a 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -76,16 +76,16 @@ CUB_NAMESPACE_BEGIN * Parameterized AgentRlePolicyT tuning policy type * * @tparam InputIteratorT - * Random-access input iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OffsetsOutputIteratorT - * Random-access output iterator type for writing run-offset values \iterator + * Random-access output iterator type for writing run-offset values @iterator * * @tparam LengthsOutputIteratorT - * Random-access output iterator type for writing run-length values \iterator + * Random-access output iterator type for writing run-length values @iterator * * @tparam NumRunsOutputIteratorT - * Output iterator type for recording the number of runs encountered \iterator + * Output iterator type for recording the number of runs encountered @iterator * * @tparam ScanTileStateT * Tile status interface type @@ -164,16 +164,16 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * Utility class for dispatching the appropriately-tuned kernels for DeviceRle * * @tparam InputIteratorT - * Random-access input 
iterator type for reading input items \iterator + * Random-access input iterator type for reading input items @iterator * * @tparam OffsetsOutputIteratorT - * Random-access output iterator type for writing run-offset values \iterator + * Random-access output iterator type for writing run-offset values @iterator * * @tparam LengthsOutputIteratorT - * Random-access output iterator type for writing run-length values \iterator + * Random-access output iterator type for writing run-length values @iterator * * @tparam NumRunsOutputIteratorT - * Output iterator type for recording the number of runs encountered \iterator + * Output iterator type for recording the number of runs encountered @iterator * * @tparam EqualityOpT * T equality operator type @@ -502,7 +502,7 @@ struct DeviceRleDispatch * Total number of input items (i.e., length of `d_in`) * * @param stream - * [optional] CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
*/ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 9f4e1bc29d..11a6324794 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -125,10 +125,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel(ScanTileStateT tile_st * Chained tuning policy * * @tparam InputIteratorT - * Random-access input iterator type for reading scan inputs \iterator + * Random-access input iterator type for reading scan inputs @iterator * * @tparam OutputIteratorT - * Random-access output iterator type for writing scan outputs \iterator + * Random-access output iterator type for writing scan outputs @iterator * * @tparam ScanTileStateT * Tile status interface type @@ -214,10 +214,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) * DeviceScan * * @tparam InputIteratorT - * Random-access input iterator type for reading scan inputs \iterator + * Random-access input iterator type for reading scan inputs @iterator * * @tparam OutputIteratorT - * Random-access output iterator type for writing scan outputs \iterator + * Random-access output iterator type for writing scan outputs @iterator * * @tparam ScanOpT * Binary scan functor type having member diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index f4dfae06ef..1f87e99c79 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -96,12 +96,12 @@ CUB_NAMESPACE_BEGIN * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * i-th data segment in `d_keys_*` and `d_values_*` * * @param[in] 
d_end_offsets * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the + * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * i-th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. @@ -291,12 +291,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the + * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. @@ -420,12 +420,12 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that `d_end_offsets[i]-1` is the last element of the + * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. 
If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. @@ -1151,12 +1151,12 @@ struct DispatchSegmentedSort : SelectedPolicy /** * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to @p temp_storage_bytes and no work + * required allocation size is written to `temp_storage_bytes` and no work * is done. */ void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation std::size_t &temp_storage_bytes; /** @@ -1179,14 +1179,14 @@ struct DispatchSegmentedSort : SelectedPolicy /** * Random-access input iterator to the sequence of beginning offsets of length - * @p num_segments, such that `d_begin_offsets[i]` is the first element of the + * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` */ BeginOffsetIteratorT d_begin_offsets; /** * Random-access input iterator to the sequence of ending offsets of length - * @p num_segments, such that d_end_offsets[i]-1 is the last element + * `num_segments`, such that d_end_offsets[i]-1 is the last element * of the ith data segment in `d_keys_*` and * `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, * the ith is considered empty. diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index a891f5a273..b2538c27bf 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -612,7 +612,7 @@ struct DispatchSpmv * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. 
* * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation @@ -978,16 +978,16 @@ struct DispatchSpmv * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param SpMV spmv_params * input parameter bundle * * @param[in] stream - * [optional] CUDA stream to launch kernels within. Default is stream0. + * **[optional]** CUDA stream to launch kernels within. Default is stream0. */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 2bf2760299..2fd3a84ee0 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -211,10 +211,10 @@ struct DispatchUniqueByKey : SelectedPolicy using ScanTileStateT = ScanTileState; /// Device-accessible allocation of temporary storage. When NULL, the required allocation size - /// is written to @p temp_storage_bytes and no work is done. + /// is written to `temp_storage_bytes` and no work is done. void *d_temp_storage; - /// Reference to size in bytes of @p d_temp_storage allocation + /// Reference to size in bytes of `d_temp_storage` allocation size_t &temp_storage_bytes; /// Pointer to the input sequence of keys @@ -239,17 +239,17 @@ struct DispatchUniqueByKey : SelectedPolicy /// Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) OffsetT num_items; - /// [optional] CUDA stream to launch kernels within. Default is stream0. + /// **[optional]** CUDA stream to launch kernels within. 
Default is stream0. cudaStream_t stream; /** * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. * * @tparam temp_storage_bytes - * [in,out] Reference to size in bytes of @p d_temp_storage allocation + * [in,out] Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys @@ -274,7 +274,7 @@ struct DispatchUniqueByKey : SelectedPolicy * Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) * * @param[in] stream - * [optional] CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ CUB_RUNTIME_FUNCTION __forceinline__ DispatchUniqueByKey(void *d_temp_storage, @@ -538,10 +538,10 @@ struct DispatchUniqueByKey : SelectedPolicy * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to - * @p temp_storage_bytes and no work is done. + * `temp_storage_bytes` and no work is done. * * @param[in,out] &temp_storage_bytes - * Reference to size in bytes of @p d_temp_storage allocation + * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys @@ -566,7 +566,7 @@ struct DispatchUniqueByKey : SelectedPolicy * Total number of input items (i.e., the length of @p d_in) * * @param[in] stream - * [optional] CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
*/ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index 21b4895396..2249f39428 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -49,12 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * \addtogroup GridModule - * @{ - */ - - /** * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid */ @@ -221,8 +215,5 @@ public: } }; - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_even_share.cuh b/cub/cub/grid/grid_even_share.cuh index 30bba4bc28..fac48c20f0 100644 --- a/cub/cub/grid/grid_even_share.cuh +++ b/cub/cub/grid/grid_even_share.cuh @@ -52,12 +52,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup GridModule - * @{ - */ - - /** * @brief GridEvenShare is a descriptor utility for distributing input among * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly @@ -226,7 +220,4 @@ public: }; - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_mapping.cuh b/cub/cub/grid/grid_mapping.cuh index d9e019539c..3c1e36a9d1 100644 --- a/cub/cub/grid/grid_mapping.cuh +++ b/cub/cub/grid/grid_mapping.cuh @@ -46,13 +46,6 @@ CUB_NAMESPACE_BEGIN - -/** - * \addtogroup GridModule - * @{ - */ - - /****************************************************************************** * Mapping policies *****************************************************************************/ @@ -110,8 +103,5 @@ enum GridMappingStrategy GRID_MAPPING_DYNAMIC, }; - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_queue.cuh b/cub/cub/grid/grid_queue.cuh index 55cfecaa2f..6d62f42e03 100644 --- a/cub/cub/grid/grid_queue.cuh +++ b/cub/cub/grid/grid_queue.cuh @@ -49,13 +49,6 @@ CUB_NAMESPACE_BEGIN - -/** - * @addtogroup GridModule - * @{ - */ - - /** * @brief GridQueue is a descriptor utility for dynamic queue management. 
* @@ -242,12 +235,8 @@ __global__ void FillAndResetDrainKernel( } - #endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @} */ // end group GridModule - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh index b561ccd488..f8061ce9b6 100644 --- a/cub/cub/iterator/arg_index_input_iterator.cuh +++ b/cub/cub/iterator/arg_index_input_iterator.cuh @@ -59,11 +59,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input wrapper for pairing dereferenced values with their corresponding * indices (forming \p KeyValuePair tuples). @@ -277,8 +272,4 @@ public: } }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh index 644cb95c7c..b04963c72c 100644 --- a/cub/cub/iterator/cache_modified_input_iterator.cuh +++ b/cub/cub/iterator/cache_modified_input_iterator.cuh @@ -62,13 +62,6 @@ CUB_NAMESPACE_BEGIN - - -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input wrapper for dereferencing array values using a PTX cache load * modifier. @@ -259,8 +252,4 @@ public: #endif }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index ee78b9a695..086ae7eb56 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -59,12 +59,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A random-access output wrapper for storing array values using a PTX cache-modifier. 
* @@ -269,7 +263,4 @@ public: } }; - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh index bf6dc1bfdc..3ed7b2b70a 100644 --- a/cub/cub/iterator/constant_input_iterator.cuh +++ b/cub/cub/iterator/constant_input_iterator.cuh @@ -58,13 +58,6 @@ CUB_NAMESPACE_BEGIN - -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A random-access input generator for dereferencing a sequence of homogeneous values * @@ -252,7 +245,4 @@ public: }; - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh index 384c44790c..f7295fce47 100644 --- a/cub/cub/iterator/counting_input_iterator.cuh +++ b/cub/cub/iterator/counting_input_iterator.cuh @@ -64,11 +64,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input generator for dereferencing a sequence of incrementing integer values. 
* @@ -248,8 +243,4 @@ public: }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/discard_output_iterator.cuh b/cub/cub/iterator/discard_output_iterator.cuh index c25f5a6e49..0b7ba3ef85 100644 --- a/cub/cub/iterator/discard_output_iterator.cuh +++ b/cub/cub/iterator/discard_output_iterator.cuh @@ -56,12 +56,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A discard iterator */ @@ -223,7 +217,4 @@ public: }; - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index f5324458d9..94fa277ba3 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -61,13 +61,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - - /** * @brief A random-access input wrapper for dereferencing array values through texture cache. * Uses newer Kepler-style texture objects. @@ -341,8 +334,4 @@ private: } }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/tex_ref_input_iterator.cuh b/cub/cub/iterator/tex_ref_input_iterator.cuh index 18cf46bb96..d1453f1d8c 100644 --- a/cub/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/cub/iterator/tex_ref_input_iterator.cuh @@ -49,11 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - /** * @brief A random-access input wrapper for dereferencing array values through texture cache. 
* @@ -117,6 +112,4 @@ template < typename OffsetT = std::ptrdiff_t> using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator; -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh index 7ce36f1741..fb8266aecb 100644 --- a/cub/cub/iterator/transform_input_iterator.cuh +++ b/cub/cub/iterator/transform_input_iterator.cuh @@ -58,12 +58,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIterator - * @{ - */ - - /** * @brief A random-access input wrapper for transforming dereferenced values. * @@ -265,8 +259,4 @@ public: } }; - - -/** @} */ // end group UtilIterator - CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 4d5d0f7fd3..4642482450 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -50,11 +50,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIo - * @{ - */ - //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- @@ -416,7 +411,4 @@ ThreadLoad(InputIteratorT itr) #endif // DOXYGEN_SHOULD_SKIP_THIS -/** @} */ // end group UtilIo - - CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index c9d521190e..ad9b657f70 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -58,12 +58,6 @@ _CCCL_DIAG_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN - -/** - * @addtogroup UtilModule - * @{ - */ - /// @brief Inequality functor (wraps equality functor) template struct InequalityWrapper @@ -428,7 +422,4 @@ __device__ __host__ BinaryFlip MakeBinaryFlip(BinaryOpT binary_op) return BinaryFlip(binary_op); } -/** @} */ // end group UtilModule - - CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_scan.cuh b/cub/cub/thread/thread_scan.cuh index 271d14c78a..7a25033b74 
100644 --- a/cub/cub/thread/thread_scan.cuh +++ b/cub/cub/thread/thread_scan.cuh @@ -50,12 +50,6 @@ CUB_NAMESPACE_BEGIN /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) namespace internal { - -/** - * @addtogroup UtilModule - * @{ - */ - /** * @name Sequential prefix scan over statically-sized array types * @{ @@ -354,8 +348,5 @@ __device__ __forceinline__ T ThreadScanInclusive(T (&input)[LENGTH], //@} end member group -/** @} */ // end group UtilModule - - -} // internal namespace +} // internal namespace CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index fc8b3beb41..62580c7640 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -48,12 +48,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilIo - * @{ - */ - - //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- @@ -414,8 +408,4 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) #endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @} */ // end group UtilIo - - CUB_NAMESPACE_END diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index 135e8f8cf2..ba92f6bd02 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -55,12 +55,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilMgmt - * @{ - */ - - /****************************************************************************** * CachingDeviceAllocator (host use) ******************************************************************************/ @@ -872,9 +866,4 @@ struct CachingDeviceAllocator }; - - - -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 44b6f57322..6f476fa807 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -25,9 +25,7 @@ * 
******************************************************************************/ -/*! \file - * \brief Detect the version of the C++ standard used by the compiler. - */ +//! @file Detect the version of the C++ standard used by the compiler. #pragma once @@ -43,6 +41,8 @@ #include +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // Deprecation warnings may be silenced by defining the following macros. These // may be combined. // - CUB_IGNORE_DEPRECATED_CPP_DIALECT: @@ -159,3 +159,5 @@ #undef CUB_COMP_DEPR_IMPL #undef CUB_COMP_DEPR_IMPL0 #undef CUB_COMP_DEPR_IMPL1 + +#endif // !DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index c1e0991e4b..693af8c2e3 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -94,12 +94,6 @@ #endif // DOXYGEN_SHOULD_SKIP_THIS -/** - * \addtogroup UtilMgmt - * @{ - */ - - // `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: #define CUB_DETAIL_DEBUG_LEVEL_NONE 0 @@ -332,6 +326,4 @@ inline __host__ __device__ void va_printf(char const*, Args const&...) 
#endif #endif -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 7c48e8a712..8b7dd6131e 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -65,12 +65,6 @@ CUB_NAMESPACE_BEGIN - -/** - * \addtogroup UtilMgmt - * @{ - */ - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail @@ -860,8 +854,4 @@ struct ChainedPolicy }; - - -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index 7956f655fd..cfadd98e13 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -49,11 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * \addtogroup UtilModule - * @{ - */ - #ifndef CUB_ALIGN #if defined(_WIN32) || defined(_WIN64) /// Align struct @@ -66,6 +61,7 @@ CUB_NAMESPACE_BEGIN #define CUB_PREVENT_MACRO_SUBSTITUTION +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, U &&u) @@ -81,6 +77,7 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, { return t < u ? 
::cuda::std::forward(u) : ::cuda::std::forward(t); } +#endif #ifndef CUB_MAX /// Select maximum(a, b) @@ -143,6 +140,4 @@ _CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) # endif #endif // !CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION -/** @} */ // end group UtilModule - CUB_NAMESPACE_END diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index b725b0646b..dd18583437 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -50,12 +50,6 @@ CUB_NAMESPACE_BEGIN -/** - * \addtogroup UtilPtx - * @{ - */ - - /****************************************************************************** * PTX helper macros ******************************************************************************/ @@ -515,8 +509,6 @@ __device__ __forceinline__ unsigned int LaneMaskGe() return ret; } -/** @} */ // end group UtilPtx - /** * @brief Shuffle-up for any data type. * Each warp-lanei obtains the value @p input contributed by @@ -524,8 +516,6 @@ __device__ __forceinline__ unsigned int LaneMaskGe() * For thread lanes @e i < src_offset, the thread's own @p input is returned to the thread. * ![](shfl_up_logo.png) * - * @ingroup WarpModule - * * @tparam LOGICAL_WARP_THREADS * The number of threads per "logical" warp. Must be a power-of-two <= 32. * @@ -606,8 +596,6 @@ ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask) * For thread lanes @e i >= WARP_THREADS, the thread's own @p input is returned to the * thread. ![](shfl_down_logo.png) * - * @ingroup WarpModule - * * @tparam LOGICAL_WARP_THREADS * The number of threads per "logical" warp. Must be a power-of-two <= 32. 
* @@ -696,8 +684,6 @@ ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask) * @tparam T * [inferred] The input/output element type * - * @ingroup WarpModule - * * @par * - Available only for SM3.0 or newer * diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh index 22020c3fbd..5750b3be6b 100644 --- a/cub/cub/util_temporary_storage.cuh +++ b/cub/cub/util_temporary_storage.cuh @@ -49,11 +49,6 @@ CUB_NAMESPACE_BEGIN -/** - * @addtogroup UtilMgmt - * @{ - */ - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** @@ -120,6 +115,4 @@ AliasTemporaries(void *d_temp_storage, #endif // DOXYGEN_SHOULD_SKIP_THIS -/** @} */ // end group UtilMgmt - CUB_NAMESPACE_END diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 5220647906..6bd60544d5 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -93,13 +93,6 @@ CUB_NAMESPACE_BEGIN #endif // !defined(__CUDACC_RTC__) #endif // !defined(CUB_IS_INT128_ENABLED) -/** - * \addtogroup UtilModule - * @{ - */ - - - /****************************************************************************** * Conditional types ******************************************************************************/ @@ -1380,7 +1373,4 @@ struct Traits : NumericTraits::type> {}; #endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @} */ // end group UtilModule - CUB_NAMESPACE_END diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh index e603ed8d36..805df97cdc 100644 --- a/cub/cub/warp/warp_exchange.cuh +++ b/cub/cub/warp/warp_exchange.cuh @@ -73,7 +73,6 @@ using InternalWarpExchangeImpl = /** * @brief The WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. - * @ingroup WarpModule * * @tparam T * The data type to be exchanged. 
diff --git a/cub/docs/benchmarking.rst b/cub/docs/benchmarking.rst index 590da8002b..a06abb1454 100644 --- a/cub/docs/benchmarking.rst +++ b/cub/docs/benchmarking.rst @@ -1,4 +1,4 @@ -Running CUB Benchmarks +CUB Benchmarks ************************************* This file contains instrutions on how to run all CUB benchmarks using CUB tuning infrastructure. diff --git a/cub/docs/index.rst b/cub/docs/index.rst index 90f82030ea..f91f4f6c58 100644 --- a/cub/docs/index.rst +++ b/cub/docs/index.rst @@ -9,6 +9,7 @@ CUB developer_overview test_overview tuning + benchmarking .. the line below can be used to use the README.md file as the index page .. .. mdinclude:: ../README.md diff --git a/cub/docs/repo.toml b/cub/docs/repo.toml index 7b33463955..23bc013609 100644 --- a/cub/docs/repo.toml +++ b/cub/docs/repo.toml @@ -74,7 +74,8 @@ doxygen_aliases = [ "warpstriped=Assumes a *warp-striped arrangement* of elements across threads, where warp\\ :sub:`i` owns the *i*\\ :sup:`th` range of (*warp-threads* * *items-per-thread*) contiguous items, and each thread owns items (*i*), (*i* + *warp-threads*), ..., (*i* + (*warp-threads* * (*items-per-thread* - 1))).", "linear_performance{1}=The work-complexity of \\1 as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU." , "plots_below=Performance plots for other scenarios can be found in the detailed method descriptions below.", - "identityzero=This operation assumes the value of obtained by the ``T``'s default constructor (or by zero-initialization if no user-defined default constructor exists) is suitable as the identity value \"zero\" for addition." 
+ "identityzero=This operation assumes the value obtained by the ``T``'s default constructor (or by zero-initialization if no user-defined default constructor exists) is suitable as the identity value \"zero\" for addition.", + "lookback=`decoupled look-back `_" ] # doxygen sometimes gets confused by macros. the array below allows the user to From c4769d7777336f7ce9fe38a12656b8401c5f8765 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 30 Nov 2023 15:10:26 -0600 Subject: [PATCH 2/4] CI log improvements (#621) * Add groups to script steps. * [skip-tests] missing quote * [skip-tests] Use function to only print group in GHA. * Fix color. * [skip-tests] Add group for env details. * [skip-tests] Add group to run_as_coder repro instructions. * Don't error on unbound. * Don't print script args * Color coder print message. * Avoid unbound errors with GITHUB_ACTIONS. * Don't run nvidia-smi manually in the test job. * sccache stats group. * Avoid sccache stats if sccache is not available. * [skip-tests] Inject intentional error. * Revert "[skip-tests] Inject intentional error." This reverts commit 7270a0cd0c5efd3e14f09bdb6355731e3bdf4a15. * Use preset name in group name. * Parameterize color. * Print sccache stats in group. * Add problem matcher. * Add problem matcher before moving repo files. * Remove the cmake regexs for now. * Try different problem matcher. * Just remove problem matchers for now. * Fix if * Remove redundant sccache stats. * Try adding problem matcher again. * Fix problem-matcher file name. * [skip-tests] Run smaller matrix for debug. * Fix path. * Use json array for matcher. * Fix json array. * [skip-tests] Disable verify devcontainers for now. * disable verify-devcontainers * Exclude home/coder from the path in the matcher. * Try a different regex. * Exclude leading slash. * Run as coder user. * Revert "Run as coder user." This reverts commit dace5f6963f58c4f545d0bf962aac2b987b6d75d. * Add ninja summary stats.
* Fix permissions of ninja summary script. * Make color conditional upon status. * Make sure to get correct build status. * Exit if build failed. * Fix if statement. * Print when build fails. * Disable exiting on non-zero return. * Don't use local, it resets exit code. * Fix variable name. * Emit error. * Make sccache stats part of group title. * Make repro instructions a conditional step. * Get rid of old code. * Go back to putting the repro instructions in the command step. * Don't output error::. * Update problem matcher. * Don't capture cmake output. * Fix group name. * Actually disable exiting on non-zero return. * Add echo -e. * Fix spacing. * Redundant "build". * Add space to fix emoji. * Move end message logic into end group. * Fix group name. * Don't print in GHA on success. * Fix emojis. * Refactor group command logic into function. * Docs. * Return status from run_command. * Revert test changes. * Update repro instructions. * Remove excess. * Use print_env_details directly to avoid duplicates. * Update problem-matcher.json * Add timing to build/test scripts. 
--- .github/problem-matchers/problem-matcher.json | 14 + .github/workflows/build-and-test-linux.yml | 1 - .github/workflows/run-as-coder.yml | 22 +- ci/build_common.sh | 198 +++++++-- ci/build_cub.sh | 4 + ci/build_libcudacxx.sh | 4 + ci/build_thrust.sh | 4 + ci/ninja_summary.py | 381 ++++++++++++++++++ ci/nvrtc_libcudacxx.sh | 2 + ci/sccache_stats.sh | 8 +- ci/test_cub.sh | 4 + ci/test_libcudacxx.sh | 4 + ci/test_thrust.sh | 4 + 13 files changed, 606 insertions(+), 44 deletions(-) create mode 100644 .github/problem-matchers/problem-matcher.json create mode 100755 ci/ninja_summary.py diff --git a/.github/problem-matchers/problem-matcher.json b/.github/problem-matchers/problem-matcher.json new file mode 100644 index 0000000000..f196a5c884 --- /dev/null +++ b/.github/problem-matchers/problem-matcher.json @@ -0,0 +1,14 @@ +{ + "problemMatcher": [ + { + "owner": "nvcc", + "pattern": [ + { + "regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$", + "severity": 4, + "message": 5 + } + ] + } + ] +} diff --git a/.github/workflows/build-and-test-linux.yml b/.github/workflows/build-and-test-linux.yml index 32cfc25951..6c5ba40061 100644 --- a/.github/workflows/build-and-test-linux.yml +++ b/.github/workflows/build-and-test-linux.yml @@ -44,5 +44,4 @@ jobs: runner: linux-${{inputs.cpu}}-gpu-v100-latest-1 image: ${{inputs.container_image}} command: | - nvidia-smi ${{ inputs.test_script }} diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml index 6d09fd220f..40bbf97b29 100644 --- a/.github/workflows/run-as-coder.yml +++ b/.github/workflows/run-as-coder.yml @@ -39,18 +39,30 @@ jobs: run: | cp -R cccl /home/coder/cccl chown -R coder:coder /home/coder/ + - name: Add NVCC problem matcher + run: | + echo "::add-matcher::cccl/.github/problem-matchers/problem-matcher.json" - name: Configure credentials and environment variables for sccache uses: ./cccl/.github/actions/configure_cccl_sccache - name: Run command shell: su coder {0} run: | - 
set -exo pipefail + set -eo pipefail cd ~/cccl + echo -e "\e[1;34mRunning as 'coder' user in $(pwd):\e[0m" + echo -e "\e[1;34m${{inputs.command}}\e[0m" eval "${{inputs.command}}" || exit_code=$? if [ ! -z "$exit_code" ]; then - echo "::error::Error! To checkout the corresponding code and reproduce locally, run the following commands:" - echo "git clone --branch $GITHUB_REF_NAME --single-branch --recurse-submodules https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" - echo "docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}" - exit $exit_code + echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m" + echo "::error:: To replicate this failure locally, follow the steps below:" + echo "1. Clone the repository, and navigate to the correct branch and commit:" + echo " git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" + echo "" + echo "2. Run the failed command inside the same Docker container used by the CI:" + echo " docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}" + echo "" + echo "For additional information, see:" + echo " - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md" + echo " - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md" fi diff --git a/ci/build_common.sh b/ci/build_common.sh index b398d5e582..7959acfb59 100755 --- a/ci/build_common.sh +++ b/ci/build_common.sh @@ -37,7 +37,6 @@ function usage { # Copy the args into a temporary array, since we will modify them and # the parent script may still need them. 
args=("$@") -echo "Args: ${args[@]}" while [ "${#args[@]}" -ne 0 ]; do case "${args[0]}" in -v | --verbose) VERBOSE=1; args=("${args[@]:1}");; @@ -90,7 +89,6 @@ export CTEST_PARALLEL_LEVEL="1" export CXX="${HOST_COMPILER}" export CUDACXX="${CUDA_COMPILER}" export CUDAHOSTCXX="${HOST_COMPILER}" - export CXX_STANDARD # Print "ARG=${ARG}" for all args. @@ -107,67 +105,193 @@ function print_var_values() { done } -echo "========================================" -echo "pwd=$(pwd)" -print_var_values \ - BUILD_DIR \ - CXX_STANDARD \ - CXX \ - CUDACXX \ - CUDAHOSTCXX \ - NVCC_VERSION \ - CMAKE_BUILD_PARALLEL_LEVEL \ - CTEST_PARALLEL_LEVEL \ - CCCL_BUILD_INFIX \ - GLOBAL_CMAKE_OPTIONS -echo "========================================" -echo -echo "========================================" -echo "Current commit is:" -git log -1 || echo "Not a repository" -echo "========================================" -echo +# begin_group: Start a named section of log output, possibly with color. +# Usage: begin_group "Group Name" [Color] +# Group Name: A string specifying the name of the group. +# Color (optional): ANSI color code to set text color. Default is blue (1;34). +function begin_group() { + # See options for colors here: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124 + local blue="34" + local name="${1:-}" + local color="${2:-$blue}" + + if [ -n "${GITHUB_ACTIONS:-}" ]; then + echo -e "::group::\e[${color}m${name}\e[0m" + else + echo -e "\e[${color}m================== ${name} ======================\e[0m" + fi +} + +# end_group: End a named section of log output and print status based on exit status. +# Usage: end_group "Group Name" [Exit Status] +# Group Name: A string specifying the name of the group. +# Exit Status (optional): The exit status of the command run within the group. Default is 0. 
+function end_group() { + local name="${1:-}" + local build_status="${2:-0}" + local duration="${3:-}" + local red="31" + local blue="34" + + if [ -n "${GITHUB_ACTIONS:-}" ]; then + echo "::endgroup::" + + if [ "$build_status" -ne 0 ]; then + echo -e "::error::\e[${red}m ${name} - Failed (⬆️ click above for full log ⬆️)\e[0m" + fi + else + if [ "$build_status" -ne 0 ]; then + echo -e "\e[${red}m================== End ${name} - Failed${duration:+ - Duration: ${duration}s} ==================\e[0m" + else + echo -e "\e[${blue}m================== End ${name} - Success${duration:+ - Duration: ${duration}s} ==================\n\e[0m" + fi + fi +} + +declare -A command_durations + +# Runs a command within a named group, handles the exit status, and prints appropriate messages based on the result. +# Usage: run_command "Group Name" command [arguments...] +function run_command() { + local group_name="${1:-}" + shift + local command=("$@") + local status + + begin_group "$group_name" + set +e + local start_time=$(date +%s) + "${command[@]}" + status=$? 
+ local end_time=$(date +%s) + set -e + local duration=$((end_time - start_time)) + end_group "$group_name" $status $duration + command_durations["$group_name"]=$duration + return $status +} + +function string_width() { + local str="$1" + echo "$str" | awk '{print length}' +} + +function print_time_summary() { + local max_length=0 + local group + + # Find the longest group name for formatting + for group in "${!command_durations[@]}"; do + local group_length=$(echo "$group" | awk '{print length}') + if [ "$group_length" -gt "$max_length" ]; then + max_length=$group_length + fi + done + + echo "Time Summary:" + for group in "${!command_durations[@]}"; do + printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}" + done + + # Clear the array of timing info + declare -gA command_durations=() +} + + +print_environment_details() { + begin_group "⚙️ Environment Details" + + echo "pwd=$(pwd)" + + print_var_values \ + BUILD_DIR \ + CXX_STANDARD \ + CXX \ + CUDACXX \ + CUDAHOSTCXX \ + NVCC_VERSION \ + CMAKE_BUILD_PARALLEL_LEVEL \ + CTEST_PARALLEL_LEVEL \ + CCCL_BUILD_INFIX \ + GLOBAL_CMAKE_OPTIONS + + echo "Current commit is:" + git log -1 || echo "Not a repository" + + if command -v nvidia-smi &> /dev/null; then + nvidia-smi + else + echo "nvidia-smi not found" + fi + + end_group "⚙️ Environment Details" +} + function configure_preset() { local BUILD_NAME=$1 local PRESET=$2 local CMAKE_OPTIONS=$3 + local GROUP_NAME="🛠️ CMake Configure ${BUILD_NAME}" pushd .. > /dev/null - - cmake --preset=$PRESET --log-level=VERBOSE $GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS - echo "$BUILD_NAME configure complete." - + run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE $GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS + status=$? 
popd > /dev/null + return $status } -function build_preset() -{ +function build_preset() { local BUILD_NAME=$1 local PRESET=$2 + local green="1;32" + local red="1;31" + local GROUP_NAME="🏗️ Build ${BUILD_NAME}" source "./sccache_stats.sh" "start" - pushd .. > /dev/null - - cmake --build --preset=$PRESET -v - echo "$BUILD_NAME build complete." + pushd .. > /dev/null + run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v + status=$? popd > /dev/null - source "./sccache_stats.sh" "end" + + minimal_sccache_stats=$(source "./sccache_stats.sh" "end") + + # Only print detailed stats in actions workflow + if [ -n "${GITHUB_ACTIONS:-}" ]; then + begin_group "💲 sccache stats" + echo "${minimal_sccache_stats}" + sccache -s + end_group + + begin_group "🥷 ninja build times" + echo "The "weighted" time is the elapsed time of each build step divided by the number + of tasks that were running in parallel. This makes it an excellent approximation + of how "important" a slow step was. A link that is entirely or mostly serialized + will have a weighted time that is the same or similar to its elapsed time. A + compile that runs in parallel with 999 other compiles will have a weighted time + that is tiny." + ./ninja_summary.py -C ${BUILD_DIR}/${PRESET} + end_group + else + echo $minimal_sccache_stats + fi + + return $status } function test_preset() { local BUILD_NAME=$1 local PRESET=$2 + local GROUP_NAME="🚀 Test ${BUILD_NAME}" pushd .. > /dev/null - - ctest --preset=$PRESET - echo "$BUILD_NAME testing complete." - + run_command "$GROUP_NAME" ctest --preset=$PRESET + status=$? 
popd > /dev/null + return $status } function configure_and_build_preset() diff --git a/ci/build_cub.sh b/ci/build_cub.sh index f31ec4fd2a..d587c2a6ad 100755 --- a/ci/build_cub.sh +++ b/ci/build_cub.sh @@ -2,6 +2,8 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + # CUB benchmarks require at least CUDA nvcc 11.5 for int128 # Returns "true" if the first version is greater than or equal to the second version_compare() { @@ -35,3 +37,5 @@ CMAKE_OPTIONS=" " configure_and_build_preset "CUB" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/build_libcudacxx.sh b/ci/build_libcudacxx.sh index 656851253a..1dc26f3228 100755 --- a/ci/build_libcudacxx.sh +++ b/ci/build_libcudacxx.sh @@ -2,7 +2,11 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="libcudacxx-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" configure_and_build_preset libcudacxx "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/build_thrust.sh b/ci/build_thrust.sh index 887f33b34b..6e4a82da0f 100755 --- a/ci/build_thrust.sh +++ b/ci/build_thrust.sh @@ -2,8 +2,12 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="thrust-cpp$CXX_STANDARD" CMAKE_OPTIONS="" configure_and_build_preset "Thrust" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/ninja_summary.py b/ci/ninja_summary.py new file mode 100755 index 0000000000..f496db534b --- /dev/null +++ b/ci/ninja_summary.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +# Copyright (c) 2018 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +r"""Summarize the last ninja build, invoked with ninja's -C syntax. + +This script is designed to be automatically run after each ninja build in +order to summarize the build's performance. Making build performance information +more visible should make it easier to notice anomalies and opportunities. 
To use +this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat. + +On Linux you can get autoninja to invoke this script using this syntax: + +$ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome + +You can also call this script directly using ninja's syntax to specify the +output directory of interest: + +> python3 post_build_ninja_summary.py -C out/Default + +Typical output looks like this: + +>ninja -C out\debug_component base +ninja.exe -C out\debug_component base -j 960 -l 48 -d keeprsp +ninja: Entering directory `out\debug_component' +[1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files +Longest build steps: + 0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time) + 0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time) + 0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time) + 1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time) +Time by build-step type: + 0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum) + 0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum) + 0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum) + 1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed +time sum) + 23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum) +26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism) +839 build steps completed, average of 32.17/s + +If no gn clean has been done then results will be for the last non-NULL +invocation of ninja. Ideas for future statistics, and implementations are +appreciated. + +The "weighted" time is the elapsed time of each build step divided by the number +of tasks that were running in parallel. This makes it an excellent approximation +of how "important" a slow step was. A link that is entirely or mostly serialized +will have a weighted time that is the same or similar to its elapsed time. 
A +compile that runs in parallel with 999 other compiles will have a weighted time +that is tiny.""" + +import argparse +import errno +import fnmatch +import os +import subprocess +import sys + +# The number of long build times to report: +long_count = 10 +# The number of long times by extension to report +long_ext_count = 10 + + +class Target: + """Represents a single line read for a .ninja_log file.""" + def __init__(self, start, end): + """Creates a target object by passing in the start/end times in seconds + as a float.""" + self.start = start + self.end = end + # A list of targets, appended to by the owner of this object. + self.targets = [] + self.weighted_duration = 0.0 + + def Duration(self): + """Returns the task duration in seconds as a float.""" + return self.end - self.start + + def SetWeightedDuration(self, weighted_duration): + """Sets the duration, in seconds, passed in as a float.""" + self.weighted_duration = weighted_duration + + def WeightedDuration(self): + """Returns the task's weighted duration in seconds as a float. + + Weighted_duration takes the elapsed time of the task and divides it + by how many other tasks were running at the same time. Thus, it + represents the approximate impact of this task on the total build time, + with serialized or serializing steps typically ending up with much + longer weighted durations. + weighted_duration should always be the same or shorter than duration. + """ + # Allow for modest floating-point errors + epsilon = 0.000002 + if (self.weighted_duration > self.Duration() + epsilon): + print('%s > %s?' % (self.weighted_duration, self.Duration())) + assert (self.weighted_duration <= self.Duration() + epsilon) + return self.weighted_duration + + def DescribeTargets(self): + """Returns a printable string that summarizes the targets.""" + # Some build steps generate dozens of outputs - handle them sanely. 
+ # The max_length was chosen so that it can fit most of the long + # single-target names, while minimizing word wrapping. + result = ', '.join(self.targets) + max_length = 65 + if len(result) > max_length: + result = result[:max_length] + '...' + return result + + +# Copied with some modifications from ninjatracing +def ReadTargets(log, show_all): + """Reads all targets from .ninja_log file |log_file|, sorted by duration. + + The result is a list of Target objects.""" + header = log.readline() + # Handle empty ninja_log gracefully by silently returning an empty list of + # targets. + if not header: + return [] + assert header == '# ninja log v5\n', \ + 'unrecognized ninja log version %r' % header + targets_dict = {} + last_end_seen = 0.0 + for line in log: + parts = line.strip().split('\t') + if len(parts) != 5: + # If ninja.exe is rudely halted then the .ninja_log file may be + # corrupt. Silently continue. + continue + start, end, _, name, cmdhash = parts # Ignore restat. + # Convert from integral milliseconds to float seconds. + start = int(start) / 1000.0 + end = int(end) / 1000.0 + if not show_all and end < last_end_seen: + # An earlier time stamp means that this step is the first in a new + # build, possibly an incremental build. Throw away the previous + # data so that this new build will be displayed independently. + # This has to be done by comparing end times because records are + # written to the .ninja_log file when commands complete, so end + # times are guaranteed to be in order, but start times are not. + targets_dict = {} + target = None + if cmdhash in targets_dict: + target = targets_dict[cmdhash] + if not show_all and (target.start != start or target.end != end): + # If several builds in a row just run one or two build steps + # then the end times may not go backwards so the last build may + # not be detected as such. 
However in many cases there will be a + # build step repeated in the two builds and the changed + # start/stop points for that command, identified by the hash, + # can be used to detect and reset the target dictionary. + targets_dict = {} + target = None + if not target: + targets_dict[cmdhash] = target = Target(start, end) + last_end_seen = end + target.targets.append(name) + return list(targets_dict.values()) + + +def GetExtension(target, extra_patterns): + """Return the file extension that best represents a target. + + For targets that generate multiple outputs it is important to return a + consistent 'canonical' extension. Ultimately the goal is to group build steps + by type.""" + for output in target.targets: + if extra_patterns: + for fn_pattern in extra_patterns.split(';'): + if fnmatch.fnmatch(output, '*' + fn_pattern + '*'): + return fn_pattern + # Not a true extension, but a good grouping. + if output.endswith('type_mappings'): + extension = 'type_mappings' + break + + # Capture two extensions if present. For example: file.javac.jar should + # be distinguished from file.interface.jar. + root, ext1 = os.path.splitext(output) + _, ext2 = os.path.splitext(root) + extension = ext2 + ext1 # Preserve the order in the file name. + + if len(extension) == 0: + extension = '(no extension found)' + + if ext1 in ['.pdb', '.dll', '.exe']: + extension = 'PEFile (linking)' + # Make sure that .dll and .exe are grouped together and that the + # .dll.lib files don't cause these to be listed as libraries + break + if ext1 in ['.so', '.TOC']: + extension = '.so (linking)' + # Attempt to identify linking, avoid identifying as '.TOC' + break + # Make sure .obj files don't get categorized as mojo files + if ext1 in ['.obj', '.o']: + break + # Jars are the canonical output of java targets. + if ext1 == '.jar': + break + # Normalize all mojo related outputs to 'mojo'. 
+ if output.count('.mojom') > 0: + extension = 'mojo' + break + return extension + + +def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting): + """Print a summary of the passed in list of Target objects.""" + + # Create a list that is in order by time stamp and has entries for the + # beginning and ending of each build step (one time stamp may have multiple + # entries due to multiple steps starting/stopping at exactly the same time). + # Iterate through this list, keeping track of which tasks are running at all + # times. At each time step calculate a running total for weighted time so + # that when each task ends its own weighted time can easily be calculated. + task_start_stop_times = [] + + earliest = -1 + latest = 0 + total_cpu_time = 0 + for target in entries: + if earliest < 0 or target.start < earliest: + earliest = target.start + if target.end > latest: + latest = target.end + total_cpu_time += target.Duration() + task_start_stop_times.append((target.start, 'start', target)) + task_start_stop_times.append((target.end, 'stop', target)) + length = latest - earliest + weighted_total = 0.0 + + # Sort by the time/type records and ignore |target| + task_start_stop_times.sort(key=lambda times: times[:2]) + # Now we have all task start/stop times sorted by when they happen. If a + # task starts and stops on the same time stamp then the start will come + # first because of the alphabet, which is important for making this work + # correctly. + # Track the tasks which are currently running. + running_tasks = {} + # Record the time we have processed up to so we know how to calculate time + # deltas. + last_time = task_start_stop_times[0][0] + # Track the accumulated weighted time so that it can efficiently be added + # to individual tasks. + last_weighted_time = 0.0 + # Scan all start/stop events. + for event in task_start_stop_times: + time, action_name, target = event + # Accumulate weighted time up to now. 
+ num_running = len(running_tasks) + if num_running > 0: + # Update the total weighted time up to this moment. + last_weighted_time += (time - last_time) / float(num_running) + if action_name == 'start': + # Record the total weighted task time when this task starts. + running_tasks[target] = last_weighted_time + if action_name == 'stop': + # Record the change in the total weighted task time while this task + # ran. + weighted_duration = last_weighted_time - running_tasks[target] + target.SetWeightedDuration(weighted_duration) + weighted_total += weighted_duration + del running_tasks[target] + last_time = time + assert (len(running_tasks) == 0) + + # Warn if the sum of weighted times is off by more than half a second. + if abs(length - weighted_total) > 500: + print('Warning: Possible corrupt ninja log, results may be ' + 'untrustworthy. Length = %.3f, weighted total = %.3f' % + (length, weighted_total)) + + # Print the slowest build steps: + print(' Longest build steps:') + if elapsed_time_sorting: + entries.sort(key=lambda x: x.Duration()) + else: + entries.sort(key=lambda x: x.WeightedDuration()) + for target in entries[-long_count:]: + print(' %8.1f weighted s to build %s (%.1f s elapsed time)' % + (target.WeightedDuration(), target.DescribeTargets(), + target.Duration())) + + # Sum up the time by file extension/type of the output file + count_by_ext = {} + time_by_ext = {} + weighted_time_by_ext = {} + # Scan through all of the targets to build up per-extension statistics. 
+ for target in entries: + extension = GetExtension(target, extra_step_types) + time_by_ext[extension] = time_by_ext.get(extension, + 0) + target.Duration() + weighted_time_by_ext[extension] = weighted_time_by_ext.get( + extension, 0) + target.WeightedDuration() + count_by_ext[extension] = count_by_ext.get(extension, 0) + 1 + + print(' Time by build-step type:') + # Copy to a list with extension name and total time swapped, to (time, ext) + if elapsed_time_sorting: + weighted_time_by_ext_sorted = sorted( + (y, x) for (x, y) in time_by_ext.items()) + else: + weighted_time_by_ext_sorted = sorted( + (y, x) for (x, y) in weighted_time_by_ext.items()) + # Print the slowest build target types: + for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]: + print( + ' %8.1f s weighted time to generate %d %s files ' + '(%1.1f s elapsed time sum)' % + (time, count_by_ext[extension], extension, time_by_ext[extension])) + + print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx ' + 'parallelism)' % + (length, total_cpu_time, total_cpu_time * 1.0 / length)) + print(' %d build steps completed, average of %1.2f/s' % + (len(entries), len(entries) / (length))) + + +def main(): + log_file = '.ninja_log' + metrics_file = 'siso_metrics.json' + parser = argparse.ArgumentParser() + parser.add_argument('-C', dest='build_directory', help='Build directory.') + parser.add_argument( + '-s', + '--step-types', + help='semicolon separated fnmatch patterns for build-step grouping') + parser.add_argument( + '-e', + '--elapsed_time_sorting', + default=False, + action='store_true', + help='Sort output by elapsed time instead of weighted time') + parser.add_argument('--log-file', + help="specific ninja log file to analyze.") + args, _extra_args = parser.parse_known_args() + if args.build_directory: + log_file = os.path.join(args.build_directory, log_file) + metrics_file = os.path.join(args.build_directory, metrics_file) + if args.log_file: + log_file = args.log_file + if not 
args.step_types: + # Offer a convenient way to add extra step types automatically, + # including when this script is run by autoninja. get() returns None if + # the variable isn't set. + args.step_types = os.environ.get('chromium_step_types') + if args.step_types: + # Make room for the extra build types. + global long_ext_count + long_ext_count += len(args.step_types.split(';')) + + if os.path.exists(metrics_file): + # Automatically handle summarizing siso builds. + cmd = ['siso.bat' if 'win32' in sys.platform else 'siso'] + cmd.extend(['metrics', 'summary']) + if args.build_directory: + cmd.extend(['-C', args.build_directory]) + if args.step_types: + cmd.extend(['--step_types', args.step_types]) + if args.elapsed_time_sorting: + cmd.append('--elapsed_time_sorting') + subprocess.run(cmd) + else: + try: + with open(log_file, 'r') as log: + entries = ReadTargets(log, False) + if entries: + SummarizeEntries(entries, args.step_types, + args.elapsed_time_sorting) + except IOError: + print('Log file %r not found, no build summary created.' % log_file) + return errno.ENOENT + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/ci/nvrtc_libcudacxx.sh b/ci/nvrtc_libcudacxx.sh index 4a0d9f6e89..a33fb14522 100755 --- a/ci/nvrtc_libcudacxx.sh +++ b/ci/nvrtc_libcudacxx.sh @@ -2,6 +2,8 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="libcudacxx-nvrtc-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh index 8abb4125c2..3a3ebc421c 100755 --- a/ci/sccache_stats.sh +++ b/ci/sccache_stats.sh @@ -1,7 +1,7 @@ #!/bin/bash # This script prints the sccache hit rate between two calls to sccache --show-stats. -# It should be sourced in your script before and after the operations you want to profile, +# It should be sourced in your script before and after the operations you want to profile, # with the 'start' or 'end' argument respectively. 
mode=$1 @@ -12,6 +12,12 @@ if [[ "$mode" != "start" && "$mode" != "end" ]]; then exit 1 fi +# Check if sccache is available +if ! command -v sccache &> /dev/null; then + echo "Notice: sccache is not available. Skipping..." + exit 0 +fi + case $mode in start) export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') diff --git a/ci/test_cub.sh b/ci/test_cub.sh index b379cc2cbf..9fd9feff48 100755 --- a/ci/test_cub.sh +++ b/ci/test_cub.sh @@ -2,8 +2,12 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + ./build_cub.sh "$@" PRESET="cub-cpp$CXX_STANDARD" test_preset CUB "${PRESET}" + +print_time_summary diff --git a/ci/test_libcudacxx.sh b/ci/test_libcudacxx.sh index c433199cc4..64736f430e 100755 --- a/ci/test_libcudacxx.sh +++ b/ci/test_libcudacxx.sh @@ -2,6 +2,8 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + PRESET="libcudacxx-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" @@ -18,3 +20,5 @@ test_preset "libcudacxx (CTest)" ${CTEST_PRESET} source "./sccache_stats.sh" "start" test_preset "libcudacxx (lit)" ${LIT_PRESET} source "./sccache_stats.sh" "end" + +print_time_summary diff --git a/ci/test_thrust.sh b/ci/test_thrust.sh index a2895f9aea..1385ef560e 100755 --- a/ci/test_thrust.sh +++ b/ci/test_thrust.sh @@ -2,8 +2,12 @@ source "$(dirname "$0")/build_common.sh" +print_environment_details + ./build_thrust.sh "$@" PRESET="thrust-cpp$CXX_STANDARD" test_preset "Thrust" ${PRESET} + +print_time_summary From 79f8f712af88756934f93dc9a1daaeedb3c5f612 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Thu, 30 Nov 2023 14:45:40 -0800 Subject: [PATCH 3/4] Setup documentation and corresponding github action (#1118) * Make top level docs for CCCL. 
* Fix image paths * Add Thrust docs build and fix any issues * Add CUB docs build * Fix PTX docs interpretting liquid syntax * Fixup libcudacxx baseurl in docs * Fixup cub permission in gen_docs.bash * Fixup thrust docs script permissions * Fix favicon in libcudacxx * Document `$TAG`. * Document scripts better. --- .github/workflows/build-docs.yml | 83 ++++++++++++ cub/docs/gen_docs.bash | 30 +++++ cub/docs/gen_docs.sh | 20 --- cub/docs/repo.toml | 2 +- docs/Dockerfile | 18 +++ docs/build_docs.bash | 21 +++ docs/jekyll/_config.yaml | 30 +++++ docs/jekyll/_sass/color_schemes/nvidia.scss | 125 ++++++++++++++++++ docs/jekyll/favicon.ico | Bin 0 -> 25214 bytes docs/{ => jekyll}/images/codespaces.png | Bin .../jekyll}/images/nvidia_logo.png | Bin docs/{ => jekyll}/images/pr-checks.png | Bin .../images/repro_instructions.png | Bin docs/jekyll/index.md | 5 + docs/make_env.bash | 13 ++ docs/readme.md | 23 ++++ libcudacxx/docs/_config.yml | 4 +- libcudacxx/docs/extended_api/ptx.md | 2 +- libcudacxx/docs/images/nvidia_logo.png | Bin 0 -> 50546 bytes thrust/docs/build_docs_locally.bash | 13 ++ thrust/docs/doxybook/config.json | 2 +- thrust/docs/github_pages/_config.yml | 3 + .../_sass/color_schemes/nvidia.scss | 1 + 23 files changed, 371 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/build-docs.yml create mode 100755 cub/docs/gen_docs.bash delete mode 100755 cub/docs/gen_docs.sh create mode 100644 docs/Dockerfile create mode 100644 docs/build_docs.bash create mode 100644 docs/jekyll/_config.yaml create mode 100644 docs/jekyll/_sass/color_schemes/nvidia.scss create mode 100644 docs/jekyll/favicon.ico rename docs/{ => jekyll}/images/codespaces.png (100%) rename {libcudacxx/docs/assets => docs/jekyll}/images/nvidia_logo.png (100%) rename docs/{ => jekyll}/images/pr-checks.png (100%) rename docs/{ => jekyll}/images/repro_instructions.png (100%) create mode 100644 docs/jekyll/index.md create mode 100644 docs/make_env.bash create mode 100644 docs/readme.md 
create mode 100644 libcudacxx/docs/images/nvidia_logo.png create mode 100755 thrust/docs/build_docs_locally.bash diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 0000000000..b3eee11d28 --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,83 @@ +name: Deploy CCCL pages + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Pages + uses: actions/configure-pages@v3 + + # Build helper image for Thrust/CUB + - name: Build helper image + run: | + bash ./docs/make_env.bash "cccl:docs" + + # Build top level docs for CCCL + - name: Build landing page + uses: actions/jekyll-build-pages@v1 + with: + source: ./docs/jekyll + destination: ./_site + + # CUB + - name: Build CUB docs + run: | + bash ./docs/build_docs.bash "cccl:docs" /cccl/cub/docs/gen_docs.bash + sudo mkdir -p ./_site/cub + sudo cp -rf ./cub/docs/_build/docs/CUB/latest/* ./_site/cub + + # Libcudacxx + - name: Build libcudacxx docs + uses: actions/jekyll-build-pages@v1 + with: + source: ./libcudacxx/docs + destination: ./_site/libcudacxx + + # Thrust + - name: Build Thrust markdown in Docker + run: bash ./docs/build_docs.bash "cccl:docs" /cccl/thrust/docs/build_docs_locally.bash + + - name: Build Thrust docs + uses: actions/jekyll-build-pages@v1 + with: + source: ./thrust/build_docs/github_pages 
+ destination: ./_site/thrust + + # Upload build artifacts + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 diff --git a/cub/docs/gen_docs.bash b/cub/docs/gen_docs.bash new file mode 100755 index 0000000000..34ba008425 --- /dev/null +++ b/cub/docs/gen_docs.bash @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +## This script just wraps launching a docs build within a container +## Tag is passed on as the first argument ${1} + +set -e + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) + +cd $SCRIPT_PATH + +## Clean image directory, without this any artifacts will prevent fetching +rm -rf img +mkdir -p img + +if [ ! -n "$(find img -name '*.png')" ]; then + wget -q https://docs.nvidia.com/cuda/_static/Logo_and_CUDA.png -O img/logo.png + + # Parse files and collects unique names ending with .png + imgs=( $(grep -R -o -h '[[:alpha:][:digit:]_]*.png' ../cub | uniq) ) + imgs+=( "cub_overview.png" "nested_composition.png" "tile.png" "blocked.png" "striped.png" ) + + for img in "${imgs[@]}" + do + echo ${img} + wget -q https://nvlabs.github.io/cub/${img} -O img/${img} || echo "!!! Failed to fetch $img" + done +fi + +./repo.sh docs || echo "!!! There were errors while generating" diff --git a/cub/docs/gen_docs.sh b/cub/docs/gen_docs.sh deleted file mode 100755 index 34d28b881d..0000000000 --- a/cub/docs/gen_docs.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/env bash - - -mkdir -p img - -if [ ! 
-n "$(find img -name '*.png')" ]; then - wget -q https://docs.nvidia.com/cuda/_static/Logo_and_CUDA.png -O img/logo.png - - # Parse files and collects unique names ending with .png - imgs=$(grep -R -o -h '[[:alpha:][:digit:]_]*.png' ../cub) - imgs="${imgs}\ncub_overview.png\nnested_composition.png\ntile.png\nblocked.png\nstriped.png" - - for img in $(echo -e ${imgs} | sort | uniq) - do - echo ${img} - wget -q https://nvlabs.github.io/cub/${img} -O img/${img} - done -fi - -./repo.sh docs diff --git a/cub/docs/repo.toml b/cub/docs/repo.toml index 23bc013609..eed418d51e 100644 --- a/cub/docs/repo.toml +++ b/cub/docs/repo.toml @@ -102,7 +102,7 @@ doxygen_predefined = [ "CUB_IGNORE_DEPRECATED_CPP_DIALECT" ] -# make sure to use ./fetch_imgs.sh +# make sure to use ./fetch_imgs.sh doxygen_conf_extra = """ IMAGE_PATH = ${config_root}/img DOXYFILE_ENCODING = UTF-8 diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 0000000000..f628b1b7f8 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,18 @@ +FROM ubuntu:22.04 + +SHELL [ "/usr/bin/env", "/bin/bash", "-c" ] + +ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 + +RUN apt-get -y -qq update; \ + apt-get -y -qq upgrade; \ + apt-get -y -qq install doxygen unzip wget + +RUN mkdir -p /opt/doxybook2; \ + cd /opt/doxybook2; \ + wget -q -O doxybook2.zip "https://github.com/matusnovak/doxybook2/releases/download/v1.5.0/doxybook2-linux-amd64-v1.5.0.zip"; \ + unzip doxybook2.zip + +ENV PATH "$PATH:/opt/doxybook2/bin" + +SHELL [ "/bin/bash" ] diff --git a/docs/build_docs.bash b/docs/build_docs.bash new file mode 100644 index 0000000000..569f70c484 --- /dev/null +++ b/docs/build_docs.bash @@ -0,0 +1,21 @@ +#!/usr/bin/env sh + +## This script just wraps launching a docs build within a container +## Tag is passed on as the first argument ${1} + +set -ex + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) +cd $SCRIPT_PATH + +CCCL_ROOT=$(realpath $SCRIPT_PATH/..) 
+ +TAG=${1} +shift + +( + docker run --rm \ + --mount type=bind,src=${CCCL_ROOT},dst=/cccl \ + $TAG \ + bash -c "$@" +) diff --git a/docs/jekyll/_config.yaml b/docs/jekyll/_config.yaml new file mode 100644 index 0000000000..bc149cf4af --- /dev/null +++ b/docs/jekyll/_config.yaml @@ -0,0 +1,30 @@ +title: CUDA C++ Core Libraries + +repository: nvidia/cccl + +remote_theme: pmarsceill/just-the-docs + +color_scheme: nvidia +logo: /images/nvidia_logo.png + +search_enabled: true +search.heading_level: 4 + +# just-the-docs ignores these filenames by default. +include: [ "contributing.md", "code_of_conduct.md" ] + +plugins_dir: + - jekyll-remote-theme + - jekyll-optional-front-matter # GitHub Pages. + - jekyll-default-layout # GitHub Pages. + - jekyll-titles-from-headings # GitHub Pages. + - jekyll-relative-links # GitHub Pages. + +defaults: + - + scope: + path: index.md + values: + title: index + nav_order: 0 + permalink: / diff --git a/docs/jekyll/_sass/color_schemes/nvidia.scss b/docs/jekyll/_sass/color_schemes/nvidia.scss new file mode 100644 index 0000000000..6bd1ddcbbf --- /dev/null +++ b/docs/jekyll/_sass/color_schemes/nvidia.scss @@ -0,0 +1,125 @@ +$body-line-height: 1.4; +$content-line-height: 1.4; +.highlight { line-height: 1.0 !important; } + +/* h1 size. We make this smaller so the README title fits on one line. */ +$font-size-9: 30px; + +/* Inline code. */ +code, +code.highlighter-rouge +{ font-size: 0.85em !important; } + +/* Code blocks. 
*/ +pre.highlight code +{ font-size: 0.9em !important; } + +$nav-width: 300px; +$content-width: 1000px; + +$body-background-color: $grey-dk-300; +$sidebar-color: $grey-dk-300; +$border-color: $grey-dk-200; + +$body-text-color: $grey-lt-300; +$body-heading-color: $grey-lt-000; +$nav-child-link-color: $grey-dk-000; +$search-result-preview-color: $grey-dk-000; + +$link-color: #76b900; +$btn-primary-color: #76b900; +$base-button-color: $grey-dk-250; + +$code-background-color: $grey-dk-250; +$search-background-color: $grey-dk-250; +$table-background-color: $grey-dk-250; +$feedback-color: darken($sidebar-color, 3%); + +div.highlighter-rouge, +pre.highlight code +{ background-color: #111 !important; } + +.highlight span.err { color: #ff0000; font-weight: bold; } /* Error */ + +.highlight span.ow, /* Operator.Word */ +.highlight span.k, /* Keyword */ +.highlight span.kc, /* Keyword.Constant */ +.highlight span.kd, /* Keyword.Declaration */ +.highlight span.kp, /* Keyword.Pseudo */ +.highlight span.kr, /* Keyword.Reserved */ +.highlight span.bp, /* Name.Builtin.Pseudo */ +.highlight span.vc, /* Name.Variable.Class */ +.highlight span.vg, /* Name.Variable.Global */ +.highlight span.vi /* Name.Variable.Instance */ +{ color: #76b900; font-weight: bold; } + +.highlight span.n, /* Name */ +.highlight span.h, /* Name */ +.highlight span.na, /* Name.Attribute */ +.highlight span.nb, /* Name.Builtin */ +.highlight span.nc, /* Name.Class */ +.highlight span.no, /* Name.Constant */ +.highlight span.nd, /* Name.Decorator */ +.highlight span.ni, /* Name.Entity */ +.highlight span.ne, /* Name.Exception */ +.highlight span.nf, /* Name.Function */ +.highlight span.nl, /* Name.Label */ +.highlight span.nn, /* Name.Namespace */ +.highlight span.nx, /* Name.Other */ +.highlight span.py, /* Name.Property */ +.highlight span.nt, /* Name.Tag */ +.highlight span.nv, /* Name.Variable */ +.highlight span.kt /* Keyword.Type */ +{ color: $grey-lt-300 } + +.highlight span.c, /* Comment */ +.highlight 
span.cm, /* Comment.Multiline */ +.highlight span.c1, /* Comment.Single */ +.highlight span.cs /* Comment.Special */ +{ color: #009966; font-style: italic } + +.highlight span.cp /* Preprocessor */ +.highlight span.kn, /* Keyword.Namespace */ +{ color: $grey-dk-000 } + +.highlight span.o, /* Operator */ +.highlight span.p /* Punctuation */ +{ color: #00ff00 } + +.highlight span.ge { font-style: italic } /* Generic.Emph */ + +.highlight span.gs { font-weight: bold } /* Generic.Strong */ + +.highlight span.l, /* Literal */ +.highlight span.ld, /* Literal.Date */ +.highlight span.m, /* Literal.Number */ +.highlight span.mf, /* Literal.Number.Float */ +.highlight span.mh, /* Literal.Number.Hex */ +.highlight span.mi, /* Literal.Number.Integer */ +.highlight span.mo, /* Literal.Number.Oct */ +.highlight span.il, /* Literal.Number.Integer.Long */ +.highlight span.s, /* Literal.String */ +.highlight span.sb, /* Literal.String.Backtick */ +.highlight span.sc, /* Literal.String.Char */ +.highlight span.sd, /* Literal.String.Doc */ +.highlight span.s2, /* Literal.String.Double */ +.highlight span.se, /* Literal.String.Escape */ +.highlight span.sh, /* Literal.String.Heredoc */ +.highlight span.si, /* Literal.String.Interpol */ +.highlight span.sx, /* Literal.String.Other */ +.highlight span.sr, /* Literal.String.Regex */ +.highlight span.s1, /* Literal.String.Single */ +.highlight span.ss /* Literal.String.Symbol */ +{ color: #119911 } + +.highlight span.w { color: #00cc00 } /* Text.Whitespace */ + +.highlight span.gh, /* Generic.Heading */ +.highlight span.gp, /* Generic.Prompt */ +.highlight span.gu /* Generic.Subheading */ +{ color: #00ff00; font-weight: bold } + +.highlight span.gd { color: #ff0000 } /* Generic.Deleted */ +.highlight span.gi { color: #00ff00 } /* Generic.Inserted */ + +.search-input { color: $body-text-color; } diff --git a/docs/jekyll/favicon.ico b/docs/jekyll/favicon.ico new file mode 100644 index 
0000000000000000000000000000000000000000..424df87200c706460f9ad1c7722ef0d35f286f2b GIT binary patch literal 25214 zcmeHP33MFAnf_ZXd&ZV!naGC>!X8`L#s@^339*=AG3F%%2(Td`0YYHKZfwp41JlG< zfZ7Zop2Zoiut~6CAT|+s5lq4~av)>Gz1)X6mVj6i!b{kVI1-e^So?idUDG|%Sh8iv zVs^^ullRHy1IHiB9@Gi#>NTAE9Jz|BG-vXET-#kRfuc`ZNday-`x^9UJMedqPeqBFtl*n;24S!75%@TQVj^Rfge5W01_+xTnrO4#tM0SS{ zFd@wV{bF);rpm=1E*e2`U7g_*MZlGOK97ce^(X!PECKjO5j<(oQS!wW=KN zXcRk>iRV~@NJl2ik`1y{wYPwv8nW?36ASS;$ZJr@#xtx(JK|)pVKb7UKhl*{2JH^| zPxfRo63@6IiG*q=KiTb!Tb0S8A(Bm?Rq?0eosw-5I}y)R7LDp|y3Y&=eWNh&1^J3+QH_*CNnxv^}dM=*(!ok?QU7|(WcVWzhy*e5+~ zDzYj9Rq;$5)5Qfgn}vOmvyxmF(pdT8^cUS-gDO`f|2`y$M@KTAp?;OBIL2uDvCqeHz{h?d2R-&fIrOm`s=BfpP>!9sw+|BNr}9 zemoIaFni0Xl*Q_biVCafFR0a(5ZG4!n(FF^wS_cgp#FwhY-x3h)l~XkZONb65jt-8 znKz85NNN7L^rAW@^%E!6#R8%EH6hhQ9{RkpL$f3jLi;3UpRjb6HY!t)H&Z5In9YU3uyUYgbm-SgWYYf9T*K z&92a0R`G;wU7HV=tlE0o!5<|jqc9UWoW-a1r$)%3T?aq@0)83-+dNP;zb?!L*%wf* zs1Nv457`@LErsOsQymUa;CG|AJSqVnW;hF7fhnZYQ$iDFuxkBh#5y64(&@U zhz(0dDRhp*gx;0LI9}saiE3!5C_ZG-iH(sOP6DJ^ zSP-iXIT0!rf^E2woB3HneYn2`2{k``&xi3}$!shC+*rfR1`R(m)tR{Z%4r9SfPT#6 za~9J`au-X+5a&GQbE_faT6}KhKc`lPCJ`4Hv&irn_3rP=Xg;HM;Td&(i;OqVsC+)X zr&T?lPQCX6nU=p$W^QPcIe0$Zy?d3+@4nQ3M!iBl_rR617oJm_9=clgdH7n{56`KK z@tk_VfBd_8PCfXsAIV{l-zZ;y@+Lf|X5=V5rylprTKQ)@qaF_|#WU(jct$-H�&2 z8TE`8?v&JvS^4fuzmVlX`584de&u300^1n@wSa7=j!%$y!{Dt+Uu)y zW!~fvSD`&>jhZ@lo~h1=`igwQLOKtJTYW<^VJW5W; z*H5!7ykLiBofNOCt9JBazC{g&L1czkY4xfZM3vYg zqpTUsxQm-L%K8j4coAyHre4LK{Zj>BIjWRnn#%Cc)%(C%wl5rN2g_uAkDC@6y+B?m z8G8IxC<4v8q~r=8QwF6%(+*qU?yO|`KY8!q?^6onje#1@3hh&>u}7JJbMz_q zu9P}pdj2Ab;rx8J`*42l?@bomf2kxMzDD+awU;qun#!|`;en?Mt=OWx5>%R z2YZe)vFC7bCf@RLZigMyvg73le3~MF^=X9NsWc0^lAoYXj?~=+^;+E0rtZX>S$!oI z7*wmLPS{0iNkP3*aI|Puk3s7sv{%>4Xuct>tCBi(PO|e(vwU52H)H1FCdrNJfiZ}%iUA<-;Y2pRe^c)67Z&vUgVpA%^xF19)HiN!Q40k zzTJTQDd0VnV@2bcOI3ls8*mWX&O&|*@FL28L9XXksku^$#(6oIzU}^d&MfG;7WV%L z4Afcde^F;bm0W6P$Q}WFAG%&ezNtvpU_bRWHJHBj{^wFPuyq;mCm=j0IA3lBP6M*Q zBfvwzdf;ZD1vn6x80tY#%%v*Pwij?Q!1?zf5Y|6bdF%Z5oF?deQN<{T`ID$03A%p( 
z373Xmf8GUu4mbvU=B0bk50-MN(P&?c{8r?D0tSxn!1Zi+|8uFMq33<*G_rq2IRzS? zi$ckBYC-ovzXSLwkVpNu$Oq1I#*^#pa)9TWrIY7ONBb)9_97qI{K=)3K>lxl8N;_w z_P_KCv$>QF91YM{_A6H8OfEGFwE4ifz^j1KWAfjCb}UeM@GFvMC6}5CTnqdaFmfi} za{N8#o9MUpU614MQ0MD@omlpqN0m+;AI{swpnVNE2G}2U9zg5FdG0yi23`)ynEBBK z`V7NbGWVR%L+)wtg~y__|DLlK_}){zdfssUgyVhy+Vr^U_J>iYznm{xzwYzw;l<^T0cC)+jFBiG*r z%B4Po@~=bfohb7ihEDW9kTde({AslNUz~x6UXyPtAw!!J0Sn;0?{Ofk$CQix_nZXy z{))buIl%cJp7Y;RZPevCe*lWV;Zi#O;qj(kt^s?SE~Curq7BgNelRySg2wphds{%) zzG)u<|2?My{P%%B7*Ef+Ib^2+?cq7QywGL}gZ+OAGP?l0uLXK@DfTJo1LwTwoK^6( zm^UmpupIcGOC6wWP*#I`uRV=xC|?q4=X};*nJO@#P&|J)b_c0uz0O{ax{k5d{|D5A zxkScXY8kLCd%nPbeQz|^HOGjt3%>b~OI?Y&?!S)D#YOGfKJQDECEu0G_wa}3oD6y& zd^B<7et}<~3zFxYA8HT#Q>q;H|03wteGhy%qo7#NQ#WLvK6Jqr*IHZg>G|_})XxKg zz8fXnhk6e0bH@KCQ4ZE5@_EkJK^tn$m!ALnT3-rTy;sm<%sFs^F|YDm>K{Q1uLtDW z2zUVZX~q6fNbBbM7uNke>bnfk{YB8;1gTWKNCmNk*tcIRH1%oJXFXlNJK=)tg z_n^L8m<@HV&(o0~2k^f0ugDJtjsjfN>8I|Wo-^#jHK6mbJV>6i6!Z^3*S?yZaT&Ti zIR1fFj{70ND@q6T=)HeCa-6r9NX6^Vz_I6?TU;c?xt2?<0S5AJZGC9}X-q!@o`am% zr5RJ+2i5{#1APke@b#;C^q6s-{~2&7_%x5!$$JgQN#Dx@y&rA-!@=VH+;h$WUV|^; zIr#x-F93IeemZilk$h%Yh#|W{uCT3QQ*Eam*V=;3|xTn8Ge~#!1^u7xsM6ovxd7S zhi?zA>+SP@V3&rzeoMt~duboN|K?f@Vq~uQ@HI1ZxzsgLT3^}!@EppexW`=mcf{FY z^W}0m0_6ylBT$Y&IRZOc1oU@7NAmx5C^^Lc*Wp*V@Z{3p+obdSB7DOXbrJP@FPkk_ z{SM5gh}gZL_tiDYCgi=1wzMMeX_h>HXC|L_8~Hmk`IT-RgK z+sZt1(YckMrs(2RF~k z^!p~h3k#n4M&`%x_W6Fvb9jc=3A_oMRPfnz_+5l~z&8NC@3;^6Ghoh}hRe5X-*Y0+ zb2h+tGStKV-VIy=@SMFBSOV+~j0yQffBy+s1N=8Ilnc!S?k#SjL%tKLhCVRnM7=@@2rOz;s{^a5!)SpwD>q z`Bx70dCFJwq2J|v8T7}2a6F3bYrVhqd7ehjGhZ9v*z(MlXN*??mjfJwgMm>%4X_g6 zyH)D%1I`74^KBgu>T&?{J}$7Y$NLeqQJ(Q^0Qe4SH9#6;F&h{d{}(|!P<2(eKZrW{ z37$^}XX2i-6Y7kiIbRRkZw1}RE%Ei802;^bEkOTYtHJgu!*?4O0z5+<4O|D9@$Nx+ zFW?k_dW;R`+P>$+P-bjNdk&x;#?;LHrDiBW3-|vQsNV)qj`rBbm=P-geIBa)Gyasa ze?52&{a^gPFZF5cTR4WsKIhyFfVR?rvDFsxmDg`6yT+F3KgY??T7COsY@O|q;0Ch0N`d2>m+~hg3(LFFX_PLI&1?VgLai;GB$MP)T_dfmC$a(z* z-&66Nsi<=tsGEHHdj*{PdhQP2e;U($I|;d2*Vmw|cWa)*b>wK^hrk)Y=Yd0ks{wQE 
zKY_CT7QSAWZ%2J7+w?b>|0bAvOz%V+Y%%2*-uPTP~L)@c%Ia`n+MlXOCF@J~odNt52G(-dMfW<3_B!SQ}i~Z^MN#94j=7 z6>T18MeRzfa&**MDHE)o<{0>0YjYGjz-;skuu!P-oPsyH7+}sR_AA~wqU95sS`Hg8 z?MF2Z3-|ES->>aF{VQ0@J^+TkSKoHZqCx~NI)eM4Zvz_uzK>lD>;>!%aDT8KFnesh z7905+mvFE1bD#^@1NaVbm0y1ic?&QW*crGGco*mw%3u5q+RXyIFYgIF4^Z}gU={EM zfX^SVs4^Y)oCAOlmArmldmr^_z_Y-&fI|TK$bA~^?+(x|W5PX|zMs>+-d9lnbl{J` zfdJ+CosuZPd*gose9pKApk58i-<4v2Hv-hpdpG&7^UEh7?*YCFd=~f}P%O{>+_aQ- z&IQDBk&ebENAKi<+T=(AD^^8 nWL_HB_S$BWKTk5o3U?0!d0QgQe;a?B}=kzlZr~3 zk!;zC8IdLXexK>{{e16!_kZ`l_npUeJ?b(uXI`(fJkL4LbKcj#r;PPjS$4A^gjn^D zA2mUUJqw{NUQ7(|o8SW8CHRBc^SA{LA+c?=e=&DBw+#vyqQOIB2{I6&H^_%~y;%`j-R}=rg z1&T2(Lxi+-qhkCk2J6DE$Fv{#6GBfNq@k|g;`9#3k=BCn#o=!qbG<)0+~zks8b-{m z-~S1~#K(t{V1Vp;p7|3gV|k@kZB*sQ$I5@(Fqty(L4mn{ytVNX7f8)`d0=GK&9rt( z^z=7B`mq06g7)VU6YXoC?|fa{89;A#c>cg@XvxfiCpz=>&pfeo0NzD>{crO)*-!rH z+kqgMYya;L(eGa}2C-d`m#~Zp7P`Uw!*OKZ&insJAv7E~CHz`t)7!;+#qDyXoWTEn z8lizqTI%4wckN9CuGUfhQ1AHGKVK!#UR67o>g{fF$WMN2>x~YfKVH^er@hSQ#B#{8 z_4mNdOHrV9Ta$`rmPIzL{`(bRuldjd`+rAN_gwo;WzXo&C;N zEF^nI?ARajPzOyfF@A#;p4`4CZvEj)XgWj$j|K{{&rX%;-m(!HQds}=r_5;{+RL7A zb3N6e>e%zA%xQVrOB_d^5TADltq)3kb2+JO$^FMW2ukpV%=7iQ0gn}#s2E<6hbMGa zxW7;+H_JDHKIA-a?`is~YC#mod z9z+O|V-`T=RW3gQ_(EIg&$?e{ZF)78_U24G1`WiR^>6Yvdi|U|vj6eRs=?@3UaMBR zAo<=Hqb&~@F@=e)OC@!S@swDr>s{~05mAF=F;HCa{K77T%xOiW-$5g+<~2UE@T+50 zydoDkvcaV`$ub2zW*6rDlwRH_F2Bpu@zRH{v8eN8?u7rx4b~(?5UAP_WqrLPxuw!v z04=4qb9rv<6K6nnv0&!%3JJ}{IirE;x8V9j?&_d;N6x1tkMn+SRL`|47+C}e(fO!Y z&|TIrdub^PvgkAF_B+hmd%5A8*%9NRIENyZo?DCmV$jl6nl$EcF`{FE{K_laBu|I^ z2zd(3y53l39pCjN)$X;@w!xa(l;u9U)S(Ak(1xssGTs4lr zv3+?jLO5CkJbOz)8LfV&MIOeqU=V@tqibxJ!np8biOV9gnpClB%DF>IJSHE;JLOrC z@|;gtHTzfVReEjR1~0c4bbe%G%58)=I5}xWyeBLB$psJ-z| z-$_YugYrbh(EdlEd-@Z`vP;zWGw)<8mL>M3?bqS1)?-0sRSSOW>}b`oe2WeEV^11f zb1BIpcrsR;AoW}ST2%EA^^;0DT2&Hhn_X3R_v-mdZOtS#y(&O@J)BPlL2E;vi zTQSIgJj?(#rM`|Vtd~wa+Ksx)(;ebsmc4JN@Y6v|#@$W|Q%^P#*%Mya5%a!P!Et2Q zmm7X(GtFd-#-e=~5dTns#TkvdfX%Hz2%Tcgvh!Vd1Z_`n_I6b??VfD;s7ZQ~(~BL0 zCs 
z!b6Ub*1@RJx@2b?Gdh%Oa(-i`x|*kU|J1(=7-Z*APvkG(X!;2L2qz_GFsZ&Qu3p1k zo}1d#>a|EB%iYVy#Kia)DcTw7(s4LLY@TdQNPJi|B{Y?tmiqXE$NN{xN;xs*&m&dd ztMN5tl?Zyq@W(gx|D+!Xx_PLkoJvDWnQ|`%@y@%gR0z#f$N?)r{OQFxUUS5zfcw!)-?-|J;CPuBl!G*Mo zU5j>T`RVf%wA;bg<>SxZF)z=bWX&sy`=+m?%C3jDa=SEd`c;D8k z?b#EpS|4u=wNB2|o16rSxoWmb;nz}^l$WFVy@pM*EYblc5i<9JQ0Cnb0tW~;O)$A+ zI-tGKgHVh{K4LqGdD8MuGjxG9isM?v5l#S*!vWdFP<6<6wxsA_}*&2GvcbSmf0bxAijgcEF z^k&+(C?mZY)AV!o;KtBFr{y)LSuKRfUshklIXC;y!mK8#op%S6^qc~J!qcWkg zu)%+L-aw+^vY<@@s6NK2w*@nhm2luW9k-W@1R@gQnwbvulq2tFn9?ZYDlfgKc|4Ac zH8f}DQ8PMbiX?G^_a~5#kPA2S|kk&nVo^FgYo}SR)(3rH9aTLI1P!IAzra(QF``%-T zWRo3q$_C)j7ojSc001HN`x$7*8b}l$;zdM%nhNdfDZL!OaxRfLXM!tsJ+G-sAI4Yc zp4XgxBmzMqHT8lmLr^9tej@qEwX>7@%-@uZ&X|GA!O7GUOo9teA6)f9h@R^(BY}ZA zLrGTe?f91v-t!{2`YJ|dKh*++6L$7n7)qR+Q4XH8_OUGxj_f-a&GPhq#gpIn>|{rT zjL!J%OHUK-=`d$sCzFkeg=cfTwdkxa^+2`m(UG3WGvEvR!s;BkUU z(mKIH1v$s8v!$Fe&=2*)Al_(WDms;AtC}Is{j{$@fK+@(TV3yM$k;7SDZN!d%_HaV zV0I0OJ=WyvRQLB;8i7#{#5lepPJe{NK`(g zKUF0zmRWRYCqh@rP=QH$RlEzX&}GKVrHuKr+1?#$5m?Vx3+&FPyzjuB$dZDdi#!nY zH10rZOG~LkI-lgQU&4YToMXY}NK*8p9q8+&<oKqJdSU;@OqfI~T9dra*EXiViMI!Zs^N9}i3&zI30=8YwyN?NS8p$7TAy*Q(4GA8{sq;a=Yn zzjX`P|NQMCuj!sDYUN8V{Q4*2Am04P8wDdxJv2aGDf^UEM$8^X^)|V%)eIgpGyUBnYRo8b!VXr zLEJx?KYc{R+Y=Qn$~^lJ+NKS$?)wTWb&WKBXIQUe7QAVRS#o;+&Axx&ixJ^ij>+|W zsua1abkj+fS&q#je5)ZMq^NgqM+4L;5wT4B_EX^W91xZQR_^k)?x*Xc?2T%DG_=M7 z+1j{{c_idR0jhn~L|rvgENAJ!RK)try5=nW(}}Af2nErh>5M61=Ohg^MA%qA$&BpO zKxO{?pA4U!z6SVnm_mL)X_8J)c3K(<^u1TJrHPV|Y>4GIfdb^bmuyZo)BBJQuAS^# z1{QzcuPWK(mx7Ua4zgpfufh+h)Ze#4g!C~mF<*I)h_$;{d`ytCr^QwN%Jd`bCP#Z@ z%gSYwgURUkWf?S*O(s81tI(~5;N+6BDNbU}ea}H_iG~Z{j<;*rDj`aqfqTgUcAIy7 znK50eW|IY{Gh9-aV#JGHlM6ia+Y29v4Cx*ME130Z2&hK)2b$2u4G6gkTAigs3a`XZ z<>_EL?@aAd9X<^H7sm`a*&u0>BIfp(U%D}VdO44uT(YB{YSh9_OUplN&Ty=^S{-hm z!DV52gMv+`8f?K#W>l(9HRT@8VdY!LKCw;^m(HFki zx9#`U+WS;|v{n=HuN8XIB^+tNph-In4;ZphG&ue2 zu$)S0B&pb&NQT-&=|7P#UzrgCA<#F!GHQV|XM5$I(;@N?wOc}}$d#)-=Zws4ETSgR 
z&Qt$oy*kq9u4s)FNuD`_zV7y6-Wp~IXUJ@vM>lz=0@q*4e$MCrG8KmBTW4#HdN1zfRO&1J6ABu@?BIk=3jDT_k!$jR$qCz5XBbtZvt2Ed3Q+ z@owFFHrx0GUKifKdyu$X7f>*sT*O|Tyj33&Y`Wmvk3`~jMMgmjp2i8`)o)(;z3+`q zr6}AZ=s2er&+lvW3Y)?$F0nn>dU~g#X1ylkkwQ)IksEM` z>LR?BtbOqmQ$2;qNlQkf=?2pMTg4Lrgd|6a%l#ErdA1E2sRzmyUSHdbh-hLfnm$=8 z#e4H$Cqe?WW4wdR=#STOvaP3Rv!=_I?W?EyRHlpClH{-D>qDg1Oh|3q_p1n1IsU%= z4wnK+7sTwtwqTZcf?RICtz<%(qKBzy+m!c>fU(f@vkvD#PAteNw{@@Gei`zl#BYsA zni_zUunKsF%lex{j4lJF9TSORBla)F+3kZ$5@?eZp-47jp-kCv!$q9ZiAgEFxB5%` zx#rhnR(}cTIObCt7mq6wS!mV0rbNt!p6a0QN1el^>E{wNugHi%BHat(!a9#Qocer; z9^nr&2@r&>-R1S(YVg~AI}@k3wDtF45Nc1PQfA`}8}Xt%F2Lr^4oT7_H+_%)fFdc*K?F}CHT zB(OPW|4}iYKbYoOb*Uijh4O=5O`I0gzB+g$G8Dwr;YXMKGHe?nV*q5LiXurqTKye2 zK+eBAIQ^_!#*2d+snR+y7b~=?oO-75kSjFbd!Wu^x^G<^U7`;5Jtt4_;Gz^ za|^H-E#dm-UHQq1ZW=D_6sgw*vb1^oVa80u%8V%)fS1t9bbwEgf}{k@DV&C$P36N?cc>;VPfrlq{0a=n;>ps5L*@r;OQ&?9quLO)7^l1Gcz>~ZeR|*v zL|GK=Qn5|amwHKli3OBTdzyaVt zY^b|J&sS>HdNCcS%!44tt5f!>5){dj2))t=2Yd_;*qD9elu4|0HPnZop&eV6w!-C# zgFVs?1HX)vmqJ=}W}!_sjnLfHpLgh-q)scXZyCslx&;t0&_89Iv!hOwe(t9}^FW=| zMHWQxZfY%g`BEsww(;@4DWAHYPtQxr0YTio zu9Khcfh!)B%=vWH4;#k1T5TP7MU2S1`x`XkDENI%g17f=c5S~qTm%X|$`rrLg~$G?0sWkv?qO*d`eEy(+!CcApC~55;!l=dzLXmsFLXAJ!Bf z?BE>pKG<(I_bEN^5X6aSg2`m7LK=gN&Vmd>wY{eEgEwy=Q#gd7%;ze-Rq|`yxJVp4 zGAB*5;CZm1Wc26iyxE@59vdgi^fwy>5C5o(YHiC8BSE!qO&XNb5g?7Y+^BtCmGZ6!_~AcG{y|6QL_K@dm1QCJmHqG-q!< z=D26mCObH{oUA&2ukQ)8W&g(9HPn9Q&j8TnU5+3_b#`coJm%aSXjN#=*|BOM;7#C& znu8Jo*?P<7Jg7Fryn3E+r+!(+zkuTTR2XR=goK%UeUei3F>6(2DoBPhV#{)PYN8{5 z?C@A`rLFsIAKilC`&B(d7o{O!C!Qu8IN7T394nwdh=c_nFq&pvscl5aO!(#&a_J-N zKmZ0&Z$koWBd-UatB+e7Q#q0KCZHtpOI1?Rluq*1uQLyf+To(axOCB4TRP>GTN%loA-;V0mg}!p}h)7c?|LOb6?J0 zQvy3qUJPlA(qw-f*@;uWSkBW+;#8gLNjCb+S`Ioa|%&|@%rDJ(k8e!njAND6xK5iqN;^7{o@!osM# zSZ-Vh#Wy`I55U-G6&h7L55s*6dldT?ZXH1(Z=Kv~dxkG7U~`Oc@7l4FcqGYz}b%}j;}Rj^S_XHa}Y`}ly?_%%*^1vX_lr8 zu4oiUKMhFs6PL$+JiANPv`Hemm|dUDc>W4YYT{DoQKb|lX|L>vKlMU8aDCEHJSZ;2 z{nxKgOSI8JsX|ZqbC;BAC!fn?LvFca$5P)of!aFVY9@lYwfQ2Q$7g>vX`NY2aiW#2 
zl>p?0H^lZ@9+vZ!eKKxg=S6V3yphwb68wIie>qt@&@Az?hAdBo=i0$x3qa9Qh099# zXpEWw$$%jLr6dphxH~`Dx(9noX%=7dOLp@_Z#sZ#raPS8&3&v-Qn@TXSflaLYCs+7 z%g9c@*SOMYX{##i7lF0>GnG9Lt=!otX4BP7wBpusw->2V%`Ag|&)eXRvUA{+g_juQ zPM*21ZgeIt#(YCXN`Ky}njW1}0cce$#$Vo<&`WY*e%I20n^W}P;8%T9NZ;G?+>hsI z!H|l|wXJ;MELXPq4gBcN(q7?%6l6{6r6mlvO&e2#qDi`3|8%p6AJrSfn+?U#e>2il z@vj)aA3Gm?5KvZ`PC3-kWlr@cES1^ke%4ye)fbW8^AFMie_^*>)MmnED(gr+7cXD$ zZ%5&4)4|6eWGC{+C9@{3pAzm_6#LPYQKxuL1y7^n4Oz)Z$Tr$`KLfS$$%>QXWr;D#h&r zlIbyJf>!@6gp}=`eyK{heE;tA`CqTYGAheoaiL5Tn*1T4(IcG~!C=FrCU5&U8vON- zPIutGkBDrupHuDG(h?u`k2*WJ@yBBmQIm3Z(fl7rlpHRYU3*M}OXbK}F*JH{W9TGL zS|uvk^}*%<(oTc8UV15Z!u8A*!R5d>eN^*-vKJak+sbsU=&gLzNV!1M$)+4R{!Mj% zpehI@H%q5}ggiT`&zvb$072;asVdQKr?v*}CD&QX$c@b~bzqCu!U5DWN+U1dNZ#nJ z-v3(hTcGjih)P9}8TGzg;Gy*7CVSI8SEg}}^bBk$ z%_{fIWJ*dW)W=^#ZEhDLQd4Fd9<2+er=liuZ*@_SnzGyoZK;q++_JsK7%9-!I&#$tXVl51ecZiSvl{spTIU z&$OUmU%a?8eX6u(A4XdZ=UXsdAVyd%IP~tW4`bTY_KS{4*-hYZh)sXhF*b^t+`eK<#wZcD;YO-D2fl3g)(QavUQyU%42P?7g}($Hd@E3-x0 zL+!8p?EFf>Oz)0HYHvVXA;l=tscQL<8{pCI%{{C0RTHbb=aXw#>zuySt0k4}B5vH_ zIx8MFq&0vvI?q=vs$tbp>t&Nvn4x_7ZHSX60l_S0^cj5n|gySd@|` z#{w$MqlOzoY#KCx^hkZ|vN0mq|87{j^~T~Zwp+&n8A5s*~j1}gzEIfG`U#-UW<4!{6*r8hi zyHL{9(JXvk0j9ykK!3DXGGxJT*LbO7=ShbDQD;$PE7pB+RytDOniM{*GI5bPJ8bKf zLJgfcmWE&&O5sPYG|qW0Jz!;RUmk7~?@N1xJwme8+$`;?+CtEpVqV*A;CEVJff=pb zO~^D@zPqX6T5>F1E$BM9B=mIyRZTyG_B9)>hGNN5!3>e4L0(9dGfb`*+d6y7mOuM# zcnxbrLz%6eyy3aj6U62ku$yX;64JBId5iZvO?6EG(tVDhxYGh|@49xP*`K+}rQR|7Ae?yUj?GnFaNo30tPYwt62r6Uzyg#-j)&~(VYhRd~ci%d>Yl1xd{3IBG zHBQAt%+xvMNEa&XOD7oqm`>k9P8S!;#a)?yS9jFxonko?9UAb5c)DsyR}N?)MzA4u zDs5zWS3aCN#=aV*b2Ihxm_!Ob&oyF#?TIp`$j&1q|Y-K)t30*|f9r zt2GpZU)>yo$y_*YW3L288W-cY4^s3(%296EWtkdOu`+3FR2HN!lnL7Hb3W+G+_Ohl z3ZHC4p8Udi0>;Yywg{HYJ}p8B{p4>Oaku_1S;e|VDH+?sE;!7#6N9#W2Dz6C>Ceer zB>0Mn$t}{MjM>pUszLPCh7#{q`0Q$yzdkshQN3_2plM3()$4b#wZaJaPVIp@nQ3^> zAn-nU+p6N`%LZW_Bo`-^JFfj*5!*7(+iwB+xFJVMHBA0XxYU&HdP|lEqLUl7LBu}P zSAupYf&ndkq)7(d4LFxNPBg#^4kBQA*df%avwtkGRH9_oJmbg`e^jZ}nwQDHMobR{ 
zwQ9FORpH;wEl^umz^brOv)$)?xX$bj5IXRtpEL^)ztqd*>PdK^;%7N_-Z{JO)s4_R zA84btf;QM{#+ZIO28Gd4SL5fo6hp#eXw55Wko*n~952`(0epAx)~gnLzpN!3_fX@< zh3p{5;z@lqDmk0oTIn_sZ*ik0ur5C`=TNEQo>R^(VeX*5gS zS86k>?Ap%6qYItONhX0zQ4c!m#7PuV=d~dXoG$&+-Eoys&(+PHw9|IiA4BNq+r@mR z_9%6Bh0)l>jlE*j&Jcr+y%{MiesogTo+_dJx_5#;@S7{^T{j;8J zzrW)Ie8=|6!-57VBtt$O*fRUOJpNC{ResI$yKmZN2(8RGSavcYLJY@IihbeLk;;}i zrBEkRYSNp`pD#;+sM?>kKL}f$n1G0DUALPTuO-hXl$;eMJ>BXDH6GABJYYN*OTt)FR@Xz z%AMW90+Xp3t|6g_yx_Miu@?2#!8-jchFIMdUOlXO*=+Js=Q#!#41nRXX@)yEcly`- zenRd1l8uac;c0B%rTb-DS~|pH+)w;I>s(QPGsEgE%NcwF#y0nQ2^Qh)m`^RUGEbdH zg2Q2@P=`mCf<*-!)ycpMlv4_B0G7G-qm+_jhup{Y=epfL4t#*_UBF!z60? zgtb!BR%S06n3~O598|60FARk$O~%{h>bx`}8Ux<$7TNE&4}*6I!pqjFvj=1s$u)ZO zEsiX-zUWa2#r7(#_B-v5pg!MzRLfU=D`fELUaq6bUj&+*h5M4pEic6NnZd7xq28Nw zVit1ZX=km`Ec;Bqq!bkEqxwVb(*{lyC^kcG3@Q}|Z$bhO*~JO&ZtO~FXUJi&Dizob>-Y5*4>nHx+8b(s}+b%*xk0e9MQ z>`2d8w{l>siiTc@aG>Nr1odf$DTU@EXEv8C>YAA*x{!7jB<#oFCaKww4?4D%mmkB=qj8 zTW>t}UsLw1qjjiWbv+ksM$az2aLn(h3DO{?tI)o0SUTGMrNK?ucVJ)8Q!HH_Dl7~powRMV=2)MenP8NMCQ?8qdQXj{jxCM7O<8<|9X*ROnM04f zG-edC`g*!(Vk(@SR{Io6gUVQ>b*yFIHF9a&`kfdL4SjI>DkW@|KVGJ}zv__`bH5eo zc7c=595?erLA7)z2e`yFp%vb!9}A~-#QEEk6NpYRD%_}Zq)IEghHe~6Dg3GYw!!2P z>G7P2Z0D<60WnJfR!;50%tTd3*^p7iuvmrpzEfTNA&oJP!JC#gpxrzbx` zz2*oTV~Y>5y?j4vnuLiP)XyCXjAhL^NITUvoFDXT!^nFE?C2P#MQg&p>M{uo*wInj zIJ9j&PV~o)S$}uzNslBc&A(8qoSq;?`kf1Vtn9m=9?2DoT^wDH>f>$ol(UZdl!dgr zAG`bt%JoqrI^%}&SlaRpFI(QNUOAg$|AHPBa)SBjfy9n3bZ5nqJ9YuFktyERc{WaP z5I5|rB@x*Y{ZCA7U-7HpbDjg0dH_XHPxu1+k5#WV32Qst8;05;nhrG=+uHTA$#Dgp zY8Rb)X*FpmY9)t3g{&a_>VC|AK8wbti0bDzDLaG#Yt2~)XLmp1I3@_Qh(OY9p_Qy* zWm^4BBqzy;^j)F4oFF|T^YkvK%z@XPLYd1xI&5{q7G?C?h<#_;#&1`AsXLTB|LRRe zFe0!*$HK2&M!o!9?9}^?#$vg_s9}Xjl54^c=dp~$3xGo78b>0i(Q(84Y@8T$^KN*seMu3>|fdKN0s6rS_tk`YraDqm(~5= zh7Ua6E9M+14Gs1apNNvf8CD`&?lC?+y%po(njO_?%Ye+2IaG-aPy;``idH%riko(n z-WDDD9ueuTE=yve=NPb#b!t#%fQnAlZ%%@Bg%Ud7@}`ysAbG$EX&SYDUpt(UHyCa& zy~;cKT6Kh>{g}#w7cL))VOEd_yHIvVK;tUbB*kz|wImo++Hd#WD%(+9)RhJNQcG{Z 
zHoEHPAsyxi1=d1ne8aGp^YzaSvw1AmUdIbr^_f#fRt1qiyUrY2#R(SPd&>nTO&;9R zo}AAPu_NLwNLz{`1RDWiTZalopo7K&<18d9ed~IH8$>yd+Ujy0)xvM1c-{hoo@Fz_ zn0g;CWR}_LeD+9m6^1a}v8|nTeI|d)&5xSL(o$Sf21L%1)Dbb4CRON_W!iqi##m9X zQj(I_dAjE}2Q3PHx22B*hpoh~8ik`ZvD+H6o@yqV99y@t4PLRtnC+azcVmW)J} zM=h>)va8tp!HC+&MOZZP=im2)r_X6cg!gnA#Ek*U46DqtY@)~4Yo19vZ{@J0ks~aV z`9E^6gIrO;L6U7i_PPATRWzv4Gpp$#1NoR6m_#g~LZXSh$YFIEon*o{f$(El0 zk4jtT`HK%s^F&b7ELHVaHCIwz1?v?}RSH?aHrdTadvDdkh;Y zz_+}n$Mvt1@{6lmxwKD87WMJlZTrO>K@Is_Ac4PATKiHFcyA4uoyNarH+2-vlNp&q$n!?lfkIu1rJq1maS3qwnc(T{ z(Oe`>_?&`m=}vdBo-X>e_H5hg3-)?;z^pZ)N5_<=l0XY4AM;r3jIrpM{ZuAt8 z@V@E8Fa}7sb($StnVp~oz(hkctiY@_Kj$LTGoq@y2?NUY5~GI^p}+7m$~^Y@$k|GF zs(<`pB|J#g?f_pe9nKtOOX3i!D==cgd?@)oZh_K{?47kroE_J2JLYkKB>5)aLHQ{l zb(m$g6sF-IH>_33G14JZ&T_g;Nmc%HFDM7wx*EH8%>MVBN5x6K%J>C7Iw-+N-qlv>8H%C8U*FKSdbS_cB$tD(}>0L<6FEt!e{8rv1@; zi~_BFu5AK1A#{qzDI4X|3Ta1I5y|0t7DQmDS?88Jb+x?bE6j^cAWC#m+PBUng|Y>z zE($60+E0MyLvQSX!O7~Ng*R0%Djg>2QJxBHwOhTnD4C@HCPU}u{rNcUe44cLH5n9E zh>I;5ANS6on4vV59I>Mxeg~oTHpQBm8e_=4GWhVAPh$@w zI(rd@m@HSef5WO?*sKp6bZ*xj-G&G<0GMvDIH8!F!QU zOt-@nEFFe`t1JxqR{iWcdUGmy&$Np8{F?`jBRlm`Z*X^$IXENW(dR8Db8j_6w9*r_ zYvd^yG|y3fIenHnZ>^}@eCU>Qm-3b|KZSgfP$>b zF?)&=TH~5~9)1GQ^q~EbB9(XkM_$_p14QJ+AISd}ll;A+r%W1_iW8vWo1z`g!NESr zeKdsJJLCWyh`3|sI~CFh-#F*D-Yd0fiLcEWQ05DtYFLd*e=6TA#z^n?9Ey%nRYdUf zHrqnT?xcV56{NCuF4r*9di4!jHWWX6PLb|ri6nDotEx^bH*|C73}>&1JE-h}qvd_D zmr&iiqb2?k)-InOJ-d^bX)Uut_sQ0rFVCR$Woh|)U}XhqM{{(rcSue}3nEgl>sjQ@ z%j5WAEXIUbj~kqNa$7dnVq>CR_euZ_MYW)%>};9mS8?0#X3TqVD6pM#Lojk~EY_VW zxS7t;+1u!1v03=@o?0ctant45BemwyJ6!jM;r_ZPH?bz`SSyF~OZ^+dWx)3jt*#Wjo<~ zL_J5b>Ci2OqCq_E*tVB?t&eE6VI2?$g$H}dSFjWBuQ4_-U5yF?=9jW8XgUZi!N|eZcg?7QFD2Nd(-yU07ueHy3qfPASnl0*<`U-zA*TVH0hE;E1g#Z)E4rMwfvkq5%=%Ko)x+?)l+&XB-u+EfC)|O0@G$njoRu`1{}J|=pi^)ROT74wwGLM(Zxeo^m7s( z!IlBdu*xumFP3zh>{2d%x~P<#O1b6UtI>b^tGyqM(sQLnY~0YT9Sw)J)eARWssVca z)W3Py=Gv`*y$Xk)38;QRy&a7@)Z?;}_PDAXRHV zr_EbQg_3jBfVVR>Sf4HAMKMAxseB=13oyCY36oPQYaQIlvH4BHhB(kA`YI`*Q=(vK 
zlIp%>z2!fW8jC9mrK5B9z~xVy(0vLI;rOaCx~hwM%VE^&d~bbO)8E79!xy7%xLt^N z4ra`DqMxstT~2vy!JQYYIy7_!5tJZ6*&Z9tFW`^FGcn%UXlPK-Fcke1o@}{Sr}qeL zUiPtYtWz-`^46vG0xWO$*AxDMhkiKA&?ECVu%%q6nD+1Uq{IQzW0#MYzdpD~+Jy!f zp^8#vV)AfZ;v2(1U9W0teyv)$piZ2X%B|AE%>8UE^AoFb2t2OF&b zu*(&^&}FCohc+^qr?dBnfhYQdXXHYA*X@C2+NBrS#J17E?t;I5@gE_|FL3hcWo>{>mK*zNN^{U9R3#%1OnwtwAR zmzq(sz?!KEU7+A_jdep2b{tFq;?_Hj8=x8`)=pI#AWquS=O=rZgWr4nVfPk7iI`Y! zQZZ@DDfTOG&|~<&kApoEIXL48{8#c7-F4w&`V9iVuRjv*xsn=CNPvIu+(;=A3YG1)tl_n+-c~`gcwc!h*n|8wZ z1J+#EJzouW8ViH}Yxhn_*$1CdW2CG5MPMhud$O?}oj?qO9?N!Zt5hvw_8t$Tx+0oZ9^xt9k$*Xh69yVD(nQ0FM8W z0z8Xh{v4f>=V5Pe+7fVCF#>RiNChAH;c(Udhz1^G^ZGg0+HH3#FU99`uLKq!zn4(^ zbE%fEACA)a<;4@h`mY&}-A82ra#d2ljHp$lVH@`sKtIwf6@j&qeSoElnd4?FK*ffh4&J4@YI1RK~l1X2^*p*0rz-IIJVXeH)VwDhX(%)WM2o$4qzV zpSG-S3c>_g1v*;7H|ar1v~>Luw(Xb3rq9B|?qMr=B)1>*Rmsfk0R$PwANbnug}avY zvg6r5U1QY1!4Qv2yWjWbZ%8}qPD6^yf%Vlie5{9W{zoYKkTz+;;%jr?TggKT4b8wZ zHPb@$o;x)aCzW=C?IwG!&{9n#ZVNdc&}^Oe_u?_RLG2~itAd0fjR!W%)=*y&IF+l; z$p&p7|6o9VK$|5UgFUlTT6~J#Ev(i`_+fgHA#b(GFWU!ufGXvKT#MG=DIe=;nGH=g ze3|%f18U$Z?0WIjHi@%?KlMlVA-khAmudx3R7?d-v__t z#CpZNnWueNDJpgIe-PnF7d+3CZIyCPHa1RC4jyj1_D@$31B!y@(8w|4D&FNY?uLr8SGaZya`ayv zvRw0-O3%66z5@=Yp+yHXMYR6jfUG`64a+gOQ5$d)@PcDbs^UI|1Q`3e(aJ zT4&u1h4gPvKKHeG#J|elln%#(v|j+s?at;~8Rb8YYYCshuxGHf+d!>l=Wc{0D+$rc z4>@d5jy5a*VMwo;exFC$0KgZp>#6fJlw^XBfBQbr?TbIs!P>h-c*ph6CN+$*CzSWu zzi35~bnHKk2B-g%NrJn=ru|gDJex3N26C~9fyuRJ75pArG${FGKfO88+w488xRZh0 z^tC!UE3xmm-ou=kh)o)44)`-3}3HBC@h3{GtIz{wZ_^vux!*1p8j z$LwDVYShL~JB8Yt>@iCZXJ?^>b$s9Wg8t~}DSGNCYo=~Zb;k__2qz1!C6{OXdO&qB#;#i_|( zOqnDl65lvTIqI)x01x?QO23i2`5QnR?{Mu;UgSDCa_TFC3U&u%60n?LwdcKfh@Rvb z43C4&K0hz8SfQHkfp+NcW}P_1>|*g5V^@s`eYkQ8bP&Xcok3+*7R>357&NMY2syO1 z27-IrEX`&s_Q4NK!@2eCi+|jKa~v#q!bHQFrJF)BQk25a3olBy`%8!|FQv}XR;kh4 zNBi&_6l?0NqnNX|`)^ItW=h)+I~RMarX^qR*FRLqG<}?oda(R=DMJiPf0vjU@F643 zjOxDy;)r6CU{}F^yo!l!pWEH_QP@F(P*@S!r@p@Sp13kK;pRF~GVw{xbQfSP5JgpY1r? 
z(5BFf^-Q_}``^CYH#yJ2$u|y?{S)!z0M?+}!OCt?5nzuJ5Aa4OgKZ~R_nnPnsM5E6w%nPy_4#L+_fEwu%Hk zJfnAf1LxeU@%Vt&kjL~Bg%j$tMQfvQ&G_uuJHU7e)$Zkn*iK3$7}`egNuS8)KtcRW zEay&?ai&Ti?%>Z^KdYvyk2H47O!UVc`~>w$6@LRPoStM7HEY!P8)MD!Z2lzF>q7%x zfhhUQfLSHBMYbk#Fl+M`Y+enF3MnT|M}8nHW+0*5w~!~4s-|!M6v+?~126L+!rv#y5Y`6}7fR5wK=Z#i zvJD2a`Wml4PAd005Pq?FMUKMG1c{(g!21aJ5R1y>rp`5ciJW(40*$z}a@K*~<~T3k z(3g5u;1}gl3cpKcC!FTnZn@A-DnK24#sxs?>VPjRDB8UfoOFimR@&0K3gjrZZXp}5 z1B*d|J2DjTk(H4l&2wI4uwip9y^fQi%&i|gMxFs&0tCo1G^}tR^;g7l$DaOc@T;FV zl1mHsG$V|aPcG;UfV}Tl7Uv6RXvQC}TObkVmYbyUquJrZZRB`lqi~lBG@dRXlU9n` z0TYFj8n(X{6ps~Ou!K4oOsUZoCHF5+v9NsWC@0{P1Pe4Vpv$y$;ZAz%z}k_f$ci8v z4EcITK4o=dJzZ|^-L0Q3+(|>+le4Gz`}aeQV((%C`_V%ITW(o3+{~4u0|nH}E#`d>+h=oR z8x-VW(L z0x2bz31M5gES(X=Zt^s()s7A!`vHBWEUv1b6?3gT)|5~#SnyoX3Xw`n>shFkfGZo6 z-jRMd9@6+WIZ9U?=^}Uo^xP6M$y35uuW&juSub~}1MW#caFrSJ98iBrnMGq{o z|5)ZTl~(ugR&IxLJGyQ>r1viy1I{g1Fk_ZhzLHe&J8v^Wok!qiXZD{My!LTJkTBIc zl@2YD*#oo_>zsaq*xI9(<_TJBc+9W!eLs7p*N4QAik>^;( zup-NvT>clreqE~1MmFjvS7wPYGy`&BR-;#6Jc&z;l2LCfB0-rCq8WrHx9@gb5w%}1 z9kheJvcg_kKd#FZb%HO+(52oGe;7F!Oz%kfno?FE1-`l-CpG zA4fwjcbpB*U$Z}6J-LegIB9&a5?Xall+uTaET=UwOZxqPGdr(cW?B-^w}u^(a&#UJ zuH;dvzCnBIkuF}^zoi|lq7R2icz04E=~++|A4RfK%Ngwci2cp+W*w)w6fCy34^s7ad3FX?)tdrI%HRujPMBt2yt4c&^85hbjege5Ny38*F?RLn{IsE*;vBj!N3*iG8Wu ze5o+?tDCDU;0d`weZ{lvrc`rN+pmAoO9dU0x>CCr!I}as!}ZhM1)^KoPOH$m-C46@ z*NxLtrW-c({729;PusD~*s|EiqD?HF*=eb{7vB;;e4q+L@ILh2NNGkJ<@>2Dm$3%l zBTzZ=0iXr^?}sDyRffi96>CK9y)_&6gp{S<7YXFO#I3&OV0OQxX0sP45KF!$1K#3( zDXrd>NY|-?8E&Mjwtxzm!$_N3MMs8=U6=zxW}>Yn?CjYlB@a|^I}L#i3GWcIJK!Pw zz#K9z5O^w#A01z2^~K=&Tb4ZMPjiD0vLVZO)lGa{Y6^UHq|U-{pQY14LzNwMPNfnD zf>lHhY;Yo2Ks#umZ(W}1Nm*r(A4av~)iQ0P2Uz$;b4fj${KbTU5e-Sc2 zP;pmrdm#7oeY6)!xf+TT7By~8+7}Y{`5!z9h7V2A;`ClMq9OgCU!=X z$bOm1267{fGJ15m#}2BbOjLT3&-tRd*L_XNkUnGOi<=AcKb|~(`C!*=h~20cRc@}` zuR54+1im=L)r4xQB-h(9Hxes(>GN8gzXI=!WEVeIY2}`P$isX1an1| z(@RP5jfT%cq5_3l7C*^uur^6kA^5)FY!4fTEH_;H#ZHvXv%*eO;4PmDUKnUdwn+mc 
zVUFOH(4*(`sP7*A1020AeSA6q77kS!ef1*W+&DDZsNN#g&)oBN|La{f`lZh(AJywJP=m*M;Vm?P4m9Y^(TGAR*4&%pRVLu1b0^dYgdNF5BeDd|#; z6QW#&K~VlG>OmhKA)3EYQNNb9cagQ{F<+qHPz{@IoZ}GPkg0Fb4gb$nvl-Yqh2Bjk zOrI&}IIq5$587{vmC%Y*T;~P{*5yY%{o?PZE@C!v4C z{Jh10l;SB%Fntn|FP}2Cq*yvV4_}$GLhvU6f%Ur$4iP!}JoCk%8>jc}uol#13n>rl z7fxL-im*tsys~-64I=gy|Libq6z6{z!3mm8*l>TYRQY-s^RK&9k0N3VjNxwV+Tq zmwpyfbr?n}ier0a5bQ{Tn{}=f8V%j;m`GQTvXpJ7K#bA;?@W3+;ycN}wt+^f?g`Jj z9Aa*#5(TM@fwLyIBcBidM}3Lk2pg*@==P1}#} zu7>K8`j2=&tLtwYgHl_M*F#C%mdWbky^mpcHZAk}^%|(V?75_D41i;`6u?l1;Ib|{ zoEs-z*n8gAUlYtxuCMEm<2reYMb&dl_%-(d$%TnEky34xeE)VFNS6m6;lzg0)6LarlryP6R-or@z zf@sc{nqWR`ODE{aJ(?5(Gs1~r4}yl-x{DuEjft0`%NvuX&#O8QRJagyqkH?&MOs?< z(nvLhzUTGpYzQJ?w)!BjP;6UPdHiU*{BfkKzL6dGa^fVucIX6E{8Zac!FNRr0yB-o* zvo}1cexoS3j|Qo;>@Vso?kmIB=D1uE5@^$NycP5cA((@vkrkU551~UGSaqAD_-0L` z>y`qDY!zAU|GF|&ubqukG-dX-pBB!4fbcdU)`{Elfjz^(&$MuM6Z=Jg+VM$eGtbpi zOhw*3mXln9zvMJWjig@%D|7pimS*u(M{Z#LxA{u(0|E7omAbUd;N-oUt{U-e^IJRl z{&GyqzM80r0C36Gz~3c5dWO%LXl?IeB=!L(tQaELIk(eK*|Fj2VL*#tP@ax81V^#n zq9LZYzvymYF?X_UR;f(exzViidtcpJO(}jHf%3th<$G0BX>~{#ncMe-ABEu^pkmIlB9QvS_S$`XyZw;zASFT+2X6kFH1zpn zid665yy@T6vcGveqe-hO^QM>lDlmzD?ynL&R{eR$hg{X&ekoY#URVT!VSL!u`Iwa2 z;<4)MHGn6fah3Aay&@S9JpOY!s&yGT7G4?QxpgxgB#xo zb$r32e0GL%ER={Ds)M9GuwD9@A#@BoK@Hy8$A370@O|;4Jc>Qzm`|w zG9b*P8vNv2iX=qZv8z+VEvpo>Z$Kg2q@Ca1v-AqHAGw*53jB%AdT_O9xq4PiJPlXA zgKMDA``tNEa~^Y<^g~r@gjj#+;i_8j*K<-c==_n}6-7S%7Nf*IR^L1A~*bOOtoPqG}I5!LSwEwS%dk{huYAxxs4=%ED?!*Xflou+$ z&@UXvT&Bw*nyxNJmZgE1_3vRg>-(*)?U(?7b?VV2vxa7CYK^@YcUvtm;v8{~TG-<|9vYlF>{! 
zj1;xEigq0!wWq~8`FT?!jqKV~HpSq0y}x(cNOH;r8J7g(7#t7QrlQh^^LhQD-~3Ro z*n6DU+4!eW_)ba~7_{xr9|Jg~L$c;m0P?H3lIzjiBN`>oFOm5rMG-a$(-o+ZKBeEu zB}m0_sQ}=yX_GY24^{m7+ZP#`~`Lem%3F>pUK}xti2vX)Y`$6Fr`!gqQ zn*O6mm*|)ZMd>NE7%IF=XE(S_NHQu=I-h*9wJjG}nHA>*gg;ffHY{ZE{f>oEQXk4= zVW7?YRUKaUh!HV#yn;2WjrX8V^l`<#imjLr5Cr`QJ^DzuTJ<)?*d;o8!ewX~M#%1< z$yM!`_7A4Wv%DOiZ_g6iz?t*=x)qLVGfynWm5dOLJ|jJ?%MkL94?fu2w*fH6?63m= zOW^XCq1H&HF9>G1s3eAPdD?xx7_%Fok4t*Q5lh*t$a>@blZWGkHa>!<;S$vIJm?#< zf~r*bj*vq5&gk}i^)lgAiO_pF`j8Dm><&q3RrOFtvdnMr$}zS8XKK2Y>Zv7DNI{QO z0z*YOdS?a@u8&{z_#w)wK8_{T^F@>VKQrW)_k^ttPw!}QVTc7h9q-vHFEMp@ zZ{e7Ed%*;I?4lff@9#I)m@1lF;-^IHA5G}YoF?WILf4zP2zE>enF}r~dp4JCc-J5BXhmOL2u`q9p&k{5K0}`fSQz zteeKj9LJ+Tji7fp4dyaQ<;>y_ry*U$PrTus@$H`b6!VPn3_2`K;3L`4{Ca>NHxIeB zkuZ#O33vu0QctwW^C{`H!+BJ=R|(X5e&W-+>;7jZNlP-dW-wl zCYr2UYOXTpuvD{53>kn(pEV+fYk55M_wD@dt2U{eUsd~VkPbnn?mVf@JbJPJa%NK| zjDkWmX~PYC`59OzvN$x~qq;JENCUv~o{xmPxl5|%Osorw$A@F~=H^gh&1%#-3< zF7d)YnShSH{Os~MYv;q;=&Wsf$YL)Yx4u^l(&!M4cpEbOUQWM0r;TXNz)+;^atkYp z@x9Dq7N&U2jv*FmS)-~#QEaMrGv=|vT7fRgmAyc)pdXWeX7=X zT?VqwGfpfD@c&RqyAy3XRdS&W{PuQMcW-M+mv_X@qIpHscjs|z30-X-bu;rkJ3grf zff9b|><}eVwE;a-6=(kOE`pC|>jC{aj2k`prbZ+c*`hazTZ~`4EylPPBpP^~bf)V0 ze6yZD9iRF2X~Fa7N(jGz-Bp~T;{YB|TVhDlGiFE05s!>rqP5JJ&j0S;{yVUn23DvTB*O?_2bT@tifxn^x>&R?wPS9wy0}8{g>Ta!>oh*eRVvJ zO*ASM7(rGp@krM|_%||0LvT_*c`jGa!kfG}aO^y)zhTrVeQwy)8H(|i2;o{81Semd zJZejU_=6N<7pS`30>DcWeG}Si!>7YFSr~D6vOTK*q^w$dvm*t={bPn}4PkQ+mK5F2 zPJ^Sjm#Ibeu6gNO-ZO?LTV%gF?i^80e!k0Hq>N8ZyrnKzl83pFjfw3eh%7Ih<=iDY z8MSGV_{ghZ=0+qhlUVE4h~KNCqS#E6ZS*zO+Q*6$88U;k;U#rB&Yj?_@TNOX7Ntf$(pdawDc-)_mmoAgOUzySf<@GfA=uP zn>+TU8#QtiGLk=XsR(hKSEp_9%+rDMGML@KyNgk4X@EX&OkMogbN`2O1O3y=B^0Zy zUw>xLO)nky9SDZ|WsqlVu`yhTe4y4G@gnr}D{+*o&Yd>MBL3?= z^Z9JzAUZQuQ`EDUV?yZMPNAqc`EA52HHzFZB>@DBPBJ-K92;3>_yL;dljP2}T&Mkn zUuIQzyaqu3seXa3_rn26)roF!ixq?X|wGccU_*&st4_af3VY#$_VlMrMaGMD4`fkxs!WxwMpt1l+F?M z8%i?geUnE`C$^bEvl-1vTL7}13p2o~_UbnK+Wok}oK;7clhg8kF#CI?Y4T#43GAjT z#fe+>w&IV@vMI&L>X#}}2)28si$~L}1USD*Wuz|7iN)n7!hG&JasKIZATM3Vuk^c) 
zY)BVv+VD~ZA$UE>!h}MOF={_1;$4E+DieOWi-{jePXOoF9wzouznQunSbU)VHdlA^ zzI=tR?)P7515fmg7m?SMp;gA$-)5KYb)sv7x#sVJTcWbhl+vTY%C%Aji`mG-r(Rj| z`O$=r?_eqfP}kxSxWQdaT1#=q8S;Kd-WRBri(x9L;O3g}cLX$fZW?zi`5NU>$VM8R zrQZl}w#kxUAI2b@&mx!Xk?IN=>a}1&3}KiB@skId)rrLKu!4h{@ToHCQz;Lb@746< zf=#8y4(It1Vuv%FR`_MVtNmueO5Pb0R#+#yB=yK$^YG%gYN{3#X$^6RjUbGxBJ2mV zNlFsh(4osSJ8I=A;V}7^z=^f+JD1^vQT_u;I8^{Z6{b^CYswKu7MH<`!bx;T2s2G)a^(hHcQYP~eZ1QbgJ zXSUnW(yV-DV$>|UJAo&$#4(nsMpmuYGoiOyYac^=!geo#vIESvu{kwDFj(5J64Wh>c5`GoKYA zCk&q(k-=RQPYiemu{10?(BNtc{v4EWwYN2`(9a==eA!AUyKNd)#<<_pV%mu0?EuSInX%{UeLXQBkv_@D3Ptp{A+T-K1^@&AvUZT zuXjU6L&P!292#rH*tbTRQ8 zpKshjz#|fb3Hk;<(j(=p{U>r4mH0a@NqT9cPWEjzoukuw^cjtP(138}YJ_BX+VueG z3cHmQ40)i@=?pY7T7g0@g*H{4hM*NylGSF4oLXD66V5TPw}9uJ@x*)Cox3AOB(E~w z4p4mreuoy7=DzT)22}<$3k)T4gu9H`;)uHwMH$xkryPBR0*6%f8f| z7iaIuJ*FT}IM5)1v!Xv+DG?tO-L1Rib2HL{Y!1^pDtZoq;XVg5G=v) ze+ZJ~V}zBBMZ;dW-z733SYh>;#c;=yEPO>&c25l$@#VoQ7{mwe2oa9o(VQMR`H+0l zIXof$G+mP0Ue)P6N~08{vyvoiCum~8J63+|%U9Z@i;H>jkC&6HR+zffp**0>IAeqL zUe!g`<{aeCX;AE%4ZI=iw$lpWLwwbhr*9M|Eb>JK8F10K2Db57zY`8O)mU`cGXZ48 z(!j+RatLR%eMUwTU@Qdgg6{}jUnN1g9|4#4uT?;V;UiMO<^*ccswcs1rtZ#BA8n7< z(5mHViot3}PY;vG>JnNKi#cHs-^v#Ajn+P!Dy1@kYBg{&0iGGxhT8N<7)S&G9H5A0 zkSag~fh&y%v+{Ib>-P^8C$IDQ@>&~Q!OaE81yT`fuaTA1I8BMGtc2`5ZUvXfDG- zTGjsIDLQNSo2Fi)Lt+rK@HW$T$n6mHZ0GVBl*whOx6WYjyz~5%vW)rgH#ES>#QVbO z&J1RKk&bWr>?v%(RA8|6edAz!1I2Pxk{^O;4m$4hJ3+!d#KAbg6vqNUNA~wm{`AJ} zu0}wy0tRJ2NvP>OHfZpfNQ-PiH5n3l>MF^Z4y&&H56RMoTOGM3H^*j@Mz|jMy9$My z$}BqnbDzThL_#o!9qKZXt}p%%4kBC)5)|XEg{emwb*$)N?gGe}-?e|-Q{DTPlfs?g zcc0hkH%Pi^5~}@Sw&u7dVY-zM5)8a9xwmW0*U+>Uh&HSP`L&sY;DJjA^KS;LQt}d2x994HK zVpXRFPi2;snKQ<*R||IOX_I?N2DmK?^`LeM$O{Ct0|N5BO6YJrwlGnk>qJ_ABxjb= z(T_V4_0I}}A)H*=abu1Ka7=dlt}7abl9J)oOoaJ4v3#8*1V0b103i|X^l+;!cZBHY z&z}O0Ms5Wu9Nm7ZqIbPIF{*qy>Ye&#xqVEBsF(7T5bsle74%~MtdCm8P+W85sWiyH?wkDzT>&)#>dd)6o?k;+r zZ&d81ZyDr}-D7o9@qm^2Qy>-tajS*U-it~Jmq&#Hz_!eoA_L`lQ;L-EgAtbxMKYE+ zzFQdU+OYGYDWaWn$dCvJ*l7uhLhKAr)722s^jud;B0D1308rCjw8}di3J(x*z-Phl 
zoPdq>5F&IONqd==VKeS+(QB%6Q>4ty=VW=uLa(%pI%s z7s*nxwds2#x*764K#!3kK()lf7Zw?(LxE3wk3%gUl;k`G+}K`l?I|F|qzs>6zcg!9 z>e)c*b1n%v=`7Z;cPY&eZ2bLCBSJEe2YX(jqWbjpI5ep^ILm?G1b%23k<{N+lZnJ7 zL!_T~H=l^`oUI?cO0`v(`sLn>LlkvOO;&Oe%VHq1WONAYhw++3z_W;XQp$wc7~(|- zrku23l?Am&fFx8L0WSw~$CCuPad{cTE@q6qm57wqF+li{aFnIwYw7abJ_WX!~3&Q<{(7ntlmM&Zh`(qh{2lqFxWs(HqFw{&ol@HFlx|uvk1lSoPi1S z4R6^h+0=_&>Mh_>I?>BO9Mv}|H6Ubs4p@5+f2r*inYHW&;pp1Zgy%*YFjrR+#?C+^ z9wJac)176$r8>=X&d3l`YMcb4>8EOzpXMkw#t}uo@R`oX);gv^MDU|0Od}<#Z82uK!Ir1 z^WCQk*WzC6wAxZ@M_9=57!I}z_Imu7uxy-YoyNZ@( z;_1*2UOe(xQ+oW`qHsj=-Q4~f`lFx~*TPvfi!FiHp|V7%kcZnYf@#xs!1vEwxM`R; z@bhbj8xNhOxA`9GdgfXg%*8dQK{Vm>O~Fqm6&0T^PM=lC_GngVHEOt~RCO6)Nb9hE zRwjTfvYTqcVbe3w1 zdcrM~f)sq!f1?nb&_-2Z@JvFsPR(AMqN_>M&&p!qJ6UiAvbT2~rs>0LLt4PnHV+dk zM97r$A%kDw)YB;+cjmP6k8I6|;F@xBU(7d8?feoDNo6MVph{W$BVn;ZrB$f`AM%0& zY8IT#u=JO88#0pJ2X<3c_iZPE=T!Aq7-@djii4AKY32+DH@*k1DP!mkUcvu_N0g!D zVJb)Rz~+nc=CP(xGY9G(j7>DE67GJhNi+#&tK|T+-f;Hw9R9zQ8 z&2c!ZVcUgO;0?)O)5tO}L8v$nu(VX1rWse^8nL<<6U0h>(Rn($tf6#W-Of;Q;eMm9 zq{zy*Lh3FYenIA2w1jZ`4OV*m-w7|Cv+Tp!tW5y>#tSdii3ssoomC! zMq5aIbciYnsv>tE({#sy2tT+~}S7ISYoGped3t!68q=YC! 
z_qCQ7r`WYl5+UL;Db#e*qkjY3P9Y*pR-#iH?1&SWht%SbX?0ZG3D zD*0#u&3z!mIn_%~f?&wbmEX#`x{4BAPe;Db3UDcB19hK~6dH_NTfG`UfQ4hU@wV%c zO^0Ak`^MmjcU_5q{WX?)N-*Y52X7u0A3K*BO8UU@V98%n4A@4_7_*z^JxT(mo7Mf{ zviJ+;?ZiFz-|nThN10);c!*Id2-1`YrzXN*bCRjG`>wSZUZ$MM|D2;a&X3%ti&<)s zWt{Ylyz>5c6jQ=i8vo?qLH*25i$|m` z+=fdv6UQ?TOG#>3d{`zgDx7ziIU(udwLCH-LrGd^vv2eCZs!qio*slY*RN-(@RFcX zG-=T+vd@sIjO~lOQ)W>d6TflRqU166`j{Hx&eO9Knqqh&@%r*o;u zPx@K4=R1sAG&UPIe+o7=3C21t-t5|f!Ql6(@Nc*BsgSO?!M`H*Dq20qHmm3jWPP!t z#1nq;i<}3~Kp)BK+eH~KB~^2M)CL=jWL3S5zY~gj@Y%YbDc2IY#N2Nx2}U;P>syjc zOD0G~A3MKPza!d>^^dKNdg0pw5o3MOx1QpXEoG7%eMQ;~4wK88xk5p3MkyuFT!fP8T{g?v zfnXRQ8&{@vO$AR6bA)_%tSM^VJyt4H*^Dn*IXk+s{U)k!wJ`Nai`rL(FLgSWaZaCy z*>ug|HgctrNF%VX%9c7GTIM*8<%G9c%!-~bS5a0?@JyWhIKF;!Qv--J5Bi{YJ~_H| zmrw#H5dQQVlvKQ;A8D-}rJN;O`T7Az4mJr6#b`{cF^w>@taY1jOZ`}e8Aarg#zLBI z-|eT<+x;}V5Q*kA-2G3}zqPHG`U<-E`KP{Qb z14jK5yRvvxqv#!>8U*pXS|@i>m^6a=%48%C>Wf{FB0sk@t88O~vo-4(`-~)+PtKa% zaEZ(8eKLvD(%4+j6xy&ctabUF>jOnNH$X>~CX|szPO$w=)iaUISC`dW85(cAhkpiE zU(2$schRVCX@jeawK_e7)L^}%w-*znCB-DXZC z9KB;JL+hoo0@f355tr$*dX-*+UPfF~)ZBX3*P!-Li9VOopIlZX$NchYePb+ZDAd!``Nm4cL@`~#%;I;#lvd>lOo;}5fMY9tkc4JcJQ4;Yz#eDhqw%_k(be+R%ez_T4 zia4R(syouf6^?~`0*n!W_Pfz?4nd0V!UQ9ggtJz9=x}d5t#S;rE$+<}C?Qu7JoAU| z03=Pas;+$-Joq#DM@QO7UCz4v$S*;SUn7^F^oDUQdaleihp&1n1#{yK07OSbfb;%3 zv0M&{hs~GQL?;c$xj(K;Ny>r+KTtmpnt+T|is}09ZwIAw9~BuGdJ+CwQjDPvUo>s= zipSiH`3;xG{Kju7X)U+sBx+MHcRYCc{YK93!TE4VZzNLssetT`%W&CuU;DE0B00WG zEK4Fp(4Scw8}LJIWm}yANW@YsPm|Si4JL;?LJ4nV*u}R>y;ivEMMYR_pFLNzemBf3 z&;&zR1yVz$tJc>BD8Niz(<+_y8lwsdwo{VO7c_l`Jj!(TiRXee9RK~W;#?OcHQ{LL9`Biz_1T6;X9Lu z@Y#1Lf>S`<{&K(=6E?s56w*XZxi#Ac@*`bTCZB)2Bx>A2B1KSIB2ix}Fd2dEhrw`EJE)Qz7^L-}h2fg(yab`4-^M(4?x*Nc zk6(TlR8_r*Gi5-`#xl46gvZ#P8Y?}ccLJa^-7y1YEBw!9ru&*Z8zd?sPEKUO)1(x{jr^lFUr#DkgI0>+V(iCYO(ss`Z0 z-8C<({XK6a+597Bu7VetLZM{U50`kAM4rNlE$Zm9H+x_aWNwjXw7-WH1NIfrEq9|0 zjj*7Up#$vH*fqHyVbtZ}`Cf_DO{cA1N<2PGmBv@yiy6|yj7cP>3Iv#)Qoyv37o}f{ z5akI+hjSbrrJ7eQaE1O9`BK$Buc8_16iC^&mjuzv1K-E$Fy 
zae0T$WYz3OIq!1{9PA@F<&mj$LflulIe}qI%jbJ@DUDLhJxX7RWF5j7w`_m6l-j=6 zRQ>nrtm1~6gkz>Fv*uf4U@uqaBg#mGy@^VcKTOSn2^!D(z|ea2oX(|^PEGqElj6^8 zT*gF4Xc1jAP3nkz4h#G`vop-U%Z!j0Mt6Q_A#|lUm5-Y36 z0{%jh(QmT1!dI}n^rmx;O`yyz+tkNifj9sODiS?!J9A~vH^k7{94D$u-{$5N7f-N% z9$MsiOr40QC4S3mRVNXs!4iQQM+Myf}LQ*9Cq@|^| z;alxfb?9kJ`{YEkOjtCHJO{O?)-M`hjD97BN# zk;D@M=RZe~TL|6={v}Gr8!ktb3Ci~ZQ6Xymg|O@G-H$;^fAL)CH&OEqAEoTuU-;4H zn%?EZ(d+jQz4S2I4j<2TUgwzpiD8Qz2?`TP&j5^w4L$nad`jsd*sy(Yb{%<(brh|~ zM8rG}VW<~xL3MwCoj)D%$`i8oj?3lW7}p~U1zzHcF0v!`&WyF4=|ApAgfFb{Z6#o2 zuW10mY;N?b55~KXY2-0uL^UhxCUN|1zDb{8#R@)P5=k)ztw-mK;7=#o=R)%hzIKgQFv9NpI1&cp3DsroPb`N06I!MXXtrw$r2UqKQs~QkrWm z((ekQ%I(Q&U&F)W<0Z?SlGOV1OeH`4u94YMB5%G3)ItQrfV7-5?D+<|y{*YUqm;(4 zpP@o*^F(pjvrtulf)+WjRh2>F8_yhMmp1pL$2zdYv`D|<=xv*~d(21UHKtLV-0Uvs zYhmb3B<=|+ZZA%aEpwU?D@RmyK%h%sT@*BdFTv}JDua!}dy@q4PhmbhK`kwf@;WyX zN!H}n7rsA~%k3Sk?){`FLou>ZhAE_ItNQz7c|%!Am=CeqPowZmJ9+wRXXZTbOmhrgXCe649WQ;sep5|?g8E@e$w|ptjYfw!$>ds= zIWvnGQe*Xu2hWD62A9KA3PP_gSgsQL^Gjn~-}XG$~P2ocYR0_(1Cb2#9JS z(*q^K84sfhp)0{)GJx)-yd zQ7(hQ9zYkpID%Vh!{M+@y3=rS!eiWREomx$6{u^t z;bw|?=*{@aGZx(-3JYuUvsI?PwAhG)AW39~n-I;9F<=3aPY5&TlBwwb04<`!Zdb?*>gG>!$n41M|H|(dx-h;dO z*CU(H6+>cUYmaw#{42a!NIjV~mr|Df3Vc$G#eOSc2h>Kl+UB#Ei9ovblCor>LN{bIZ1VW@0;!Vgvb|2%{z zMhOCJ9hNETZHq;({E-{`K^hLczL=w9W6|S8WvyX7;f8AUe;#~OSu_OKH8*->HW)=b zMZbivpTjY0en;OxT_-Qr$b4FPDNtcVHE+yB?cYCw%*!`L!{oJ z)`9b{L>11z9|8kPQO*;ooqfZvr&j)Ra1b1}>+rv<#$lEC;NS5@n>{#p&Pk8v=V-|G zKhFnUAajLzaYRx`W_+{w$(v#KIy#!%qyBU42y!3+eZQzrQQMP}PS~ z%Epc`^ooj%YW(W=T~ON}TT_)|N7n(;`#+DBCcKB>Xu8?_<^1g@$4*@M4>iGxn23Oj z6Iy#OL*P=><&A4LwSGZ6oyLv-`#nM;D7_j8S#Q6OMIJKGY0J8}+y_Aa|2+7YsCa79 z*PYr9( z?`1dO5W&x*2KP*vks%Sd)qg)&IaD|htA|Mww_mZ$3v&MN{UbY7|C?UuZ|u-}`LE{< zy8r)rE}*CWujdHR^1q*ZK>GiB@Tgq;*Q1Hrj{kau_WqBM|Leb}puF&3&!6r5ha7*X z^B-FL;hq1`;{P|2^`9gE`L{n5_(Op|6!=4dKNR>wfj<=Ze@TJy-z(#IG*Hs)`rSWE zcjo`)+JDyiLxDdO_(Op|6!=4dKNR>wfj<=ZLxKN$3QQG$ibfEs;gh;rf4tNGd-DDH k#UBd%p}_yADKJN&uDL(qkzWpUIwHtPJyYE>ZEW=a0qVmQf&c&j literal 0 HcmV?d00001 diff --git 
a/thrust/docs/build_docs_locally.bash b/thrust/docs/build_docs_locally.bash new file mode 100755 index 0000000000..d77bc97cbc --- /dev/null +++ b/thrust/docs/build_docs_locally.bash @@ -0,0 +1,13 @@ +#!/usr/bin/env sh + +## This script will produce a 'build_docs' folder that contains a jekyll site containing all the Thrust docs +## This is used in CI to produce a site for Thrust under CCCL + +set -ex + +SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) + +cd $SCRIPT_PATH +mkdir -p build +cp github_pages/Gemfile build/Gemfile +./generate_markdown.bash diff --git a/thrust/docs/doxybook/config.json b/thrust/docs/doxybook/config.json index 56b7a238be..14c1ced4c6 100644 --- a/thrust/docs/doxybook/config.json +++ b/thrust/docs/doxybook/config.json @@ -1,5 +1,5 @@ { - "baseUrl": "{{ site.baseurl }}/api/", + "baseUrl": "/{{ site.baseurl }}/api/", "copyImages": true, "fileExt": "md", "filesFilter": [], diff --git a/thrust/docs/github_pages/_config.yml b/thrust/docs/github_pages/_config.yml index c131e84fb2..c4a48ffa0a 100644 --- a/thrust/docs/github_pages/_config.yml +++ b/thrust/docs/github_pages/_config.yml @@ -12,6 +12,8 @@ search.heading_level: 4 incremental: true +baseurl: "cccl/thrust" + # just-the-docs ignores these filenames by default. include: [ "contributing.md", "code_of_conduct.md" ] @@ -19,6 +21,7 @@ exclude: [ "node_modules", "doxybook_templates", "generate_markdown.bash", "serve_docs_locally.bash" ] plugins: + - jekyll-remote-theme - jekyll-optional-front-matter # GitHub Pages. - jekyll-default-layout # GitHub Pages. - jekyll-titles-from-headings # GitHub Pages. 
diff --git a/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss b/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss index 4b44fa222e..6a63f85e2a 100644 --- a/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss +++ b/thrust/docs/github_pages/_sass/color_schemes/nvidia.scss @@ -31,6 +31,7 @@ code.doxybook h3 { margin-bottom: 1.0em !important; } $nav-width: 300px; +$content-width: 1000px; $body-background-color: $grey-dk-300; $sidebar-color: $grey-dk-300; From 1570b18ad9c5e69ba4b6dcb6db68c9f4784cd036 Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Fri, 1 Dec 2023 05:41:21 -0800 Subject: [PATCH 4/4] Update Docs links in README.md (#1169) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7171af7330..29848976d2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) -|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/thrust/)|[CUB Docs](https://nvlabs.github.io/cub/)| +|[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/cccl/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/cccl/thrust/)|[CUB Docs](https://nvidia.github.io/cccl/cub/)| 
|-|-|-|-|-|-|-|-| # CUDA C++ Core Libraries (CCCL)