Qualify call to distance in thrust::async_reduce (#1904)

If the arguments to the unqualified call to distance contain entities from the thrust and cuda::std namespaces at the same time, the distance functions from both namespaces are available and cause an ambiguity. Also use ::cuda::std::unique_ptr to work around a swap issue: std::unique_ptr in libstdc++ before gcc10 used swap() internally, which would lead to an ambiguous call between std::swap and cuda::std::swap. Fixes: #1886
NVIDIA · Jun 25, 2024 · df8109c · df8109c
1 parent 3df3bcf
commit df8109c
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 5 deletions.
diff --git a/thrust/testing/async_reduce.cu b/thrust/testing/async_reduce.cu
@@ -836,4 +836,45 @@ DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_async_copy_then_reduce, BuiltinNu
 
 // TODO: when_all from reductions.
 
+// See also issue: https://github.com/NVIDIA/cccl/issues/1886
+struct test_async_reduce_bug1886
+{
+  struct tuple_sum
+  {
+    __device__ thrust::tuple<int, int>
+    operator()(const thrust::tuple<int, int>& t1, const thrust::tuple<int, int>& t2) const
+    {
+      return thrust::make_tuple(thrust::get<0>(t1) + thrust::get<0>(t2), thrust::get<1>(t1) + thrust::get<1>(t2));
+    }
+  };
+
+  void operator()() const
+  {
+    // Initialize input data
+    thrust::device_vector<int> d_data1{1, 2, 3, 4, 5};
+    thrust::device_vector<int> d_data2{10, 20, 30, 40, 50};
+
+    using TupleType = thrust::tuple<int, int>;
+    using IteratorType =
+      thrust::zip_iterator<thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator>>;
+
+    // Create zip_begin and zip_end iterators
+    IteratorType zip_begin = thrust::make_zip_iterator(thrust::make_tuple(d_data1.begin(), d_data2.begin()));
+    IteratorType zip_end   = thrust::make_zip_iterator(thrust::make_tuple(d_data2.end(), d_data2.end()));
+
+    // Initialize the starting tuple
+    TupleType init = thrust::make_tuple(0, 0);
+
+    // Perform async reduce using zip_begin and zip_end
+    auto future = thrust::async::reduce(thrust::device, zip_begin, zip_end, init, tuple_sum());
+
+    // Get the result
+    TupleType result = future.get();
+
+    // Print the result
+    std::cout << "Sum: (" << thrust::get<0>(result) << ", " << thrust::get<1>(result) << ")" << std::endl;
+  }
+};
+DECLARE_UNITTEST(test_async_reduce_bug1886);
+
 #endif
diff --git a/thrust/thrust/system/cuda/detail/async/reduce.h b/thrust/thrust/system/cuda/detail/async/reduce.h
@@ -141,7 +141,7 @@ namespace cuda_cub
 // ADL entry point.
 template <typename DerivedPolicy, typename ForwardIt, typename Sentinel, typename T, typename BinaryOp>
 auto async_reduce(execution_policy<DerivedPolicy>& policy, ForwardIt first, Sentinel last, T init, BinaryOp op)
-  THRUST_RETURNS(thrust::system::cuda::detail::async_reduce_n(policy, first, distance(first, last), init, op))
+  THRUST_RETURNS(thrust::system::cuda::detail::async_reduce_n(policy, first, thrust::distance(first, last), init, op))
 
 } // namespace cuda_cub
 
@@ -222,7 +222,7 @@ template <typename DerivedPolicy, typename ForwardIt, typename Sentinel, typenam
 auto async_reduce_into(
   execution_policy<DerivedPolicy>& policy, ForwardIt first, Sentinel last, OutputIt output, T init, BinaryOp op)
   THRUST_RETURNS(
-    thrust::system::cuda::detail::async_reduce_into_n(policy, first, distance(first, last), output, init, op))
+    thrust::system::cuda::detail::async_reduce_into_n(policy, first, thrust::distance(first, last), output, init, op))
 
 } // namespace cuda_cub
 

diff --git a/thrust/thrust/system/cuda/detail/future.inl b/thrust/thrust/system/cuda/detail/future.inl
@@ -37,6 +37,8 @@
 #  include <thrust/type_traits/integer_sequence.h>
 #  include <thrust/type_traits/remove_cvref.h>
 
+#  include <cuda/std/__memory/unique_ptr.h>
+
 #  include <type_traits>
 
 THRUST_NAMESPACE_BEGIN
@@ -669,7 +671,7 @@ public:
   _CCCL_HOST explicit unique_eager_event(unique_eager_future<U>&& other)
       // NOTE: We upcast to `unique_ptr<async_signal>` here.
       : device_(other.where())
-      , async_signal_(std::move(other.async_signal_))
+      , async_signal_(other.async_signal_.release())
   {}
 
   _CCCL_HOST
@@ -758,11 +760,11 @@ struct unique_eager_future final
 
 private:
   int device_ = 0;
-  std::unique_ptr<detail::async_value<value_type>> async_signal_;
+  ::cuda::std::unique_ptr<detail::async_value<value_type>> async_signal_;
 
   _CCCL_HOST explicit unique_eager_future(int device_id, std::unique_ptr<detail::async_value<value_type>> async_signal)
       : device_(device_id)
-      , async_signal_(std::move(async_signal))
+      , async_signal_(async_signal.release())
   {}
 
 public: