From a3a5f9c227c63c8f328ed2c181f0e935cc713eda Mon Sep 17 00:00:00 2001
From: Eric Niebler
Date: Fri, 9 Aug 2024 10:46:24 -0700
Subject: [PATCH] CUDA `vector_add` sample project (#2160)

---------

Co-authored-by: pciolkosz
Co-authored-by: Michael Schellenberger Costa
---
 cudax/CMakeLists.txt                          |  13 +-
 cudax/cmake/cudaxBuildCompilerTargets.cmake   |   2 +-
 cudax/cmake/cudaxBuildTargetList.cmake        |   1 +
 .../cuda/experimental/__detail/utility.cuh    |  19 ++-
 .../cuda/experimental/__launch/param_kind.cuh |  85 ++++++++++
 cudax/include/cuda/experimental/launch.cuh    |   3 +
 cudax/samples/CMakeLists.txt                  |  76 +++++++++
 cudax/samples/cmake/CPM.cmake                 |  33 ++++
 cudax/samples/vector_add/param_kind.cuh       |  85 ++++++++++
 cudax/samples/vector_add/vector.cuh           | 151 ++++++++++++++++++
 cudax/samples/vector_add/vector_add.cu        | 127 +++++++++++++++
 11 files changed, 589 insertions(+), 6 deletions(-)
 create mode 100644 cudax/include/cuda/experimental/__launch/param_kind.cuh
 create mode 100755 cudax/samples/CMakeLists.txt
 create mode 100755 cudax/samples/cmake/CPM.cmake
 create mode 100644 cudax/samples/vector_add/param_kind.cuh
 create mode 100644 cudax/samples/vector_add/vector.cuh
 create mode 100644 cudax/samples/vector_add/vector_add.cu

diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt
index 4886562aca..f875cf8ebf 100644
--- a/cudax/CMakeLists.txt
+++ b/cudax/CMakeLists.txt
@@ -11,7 +11,7 @@ if (cudax_TOPLEVEL_PROJECT)
   cmake_minimum_required(VERSION 3.21)
 endif()
 
-project(cudax LANGUAGES CUDA)
+project(cudax LANGUAGES CUDA CXX)
 
 option(cudax_ENABLE_INSTALL_RULES "Enable installation of CUDA Experimental." ${cudax_TOPLEVEL_PROJECT})
 if (cudax_ENABLE_INSTALL_RULES)
@@ -25,6 +25,7 @@ endif()
 
 option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON)
 option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
+option(cudax_ENABLE_SAMPLES "Build CUDA Experimental's samples." ON)
 
 include(cmake/cudaxBuildCompilerTargets.cmake)
 include(cmake/cudaxBuildTargetList.cmake)
@@ -41,3 +42,13 @@ if (cudax_ENABLE_TESTING)
   enable_testing() # Must be in root directory
   add_subdirectory(test)
 endif()
+
+if (cudax_ENABLE_SAMPLES)
+  include(ExternalProject)
+  ExternalProject_Add(cudax_samples
+    PREFIX samples
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/samples"
+    BUILD_ALWAYS ON
+    INSTALL_COMMAND cmake -E echo "Skipping install step.")
+  add_dependencies(cudax.all cudax_samples)
+endif()
diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake
index 73aa9e376e..53cf7b8af4 100644
--- a/cudax/cmake/cudaxBuildCompilerTargets.cmake
+++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake
@@ -9,7 +9,7 @@
 include("${cudax_SOURCE_DIR}/cmake/AppendOptionIfAvailable.cmake")
 
 function(cudax_build_compiler_targets)
-  set(cxx_compile_definitions)
+  set(cxx_compile_definitions LIBCUDACXX_ENABLE_EXCEPTIONS)
   set(cxx_compile_options)
   set(cuda_compile_options)
 
diff --git a/cudax/cmake/cudaxBuildTargetList.cmake b/cudax/cmake/cudaxBuildTargetList.cmake
index 63284dbe4a..2be17393dc 100644
--- a/cudax/cmake/cudaxBuildTargetList.cmake
+++ b/cudax/cmake/cudaxBuildTargetList.cmake
@@ -176,6 +176,7 @@ function(cudax_build_target_list)
   file(GLOB_RECURSE all_sources
     RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
     "${cudax_SOURCE_DIR}/include/cuda/experimental/*.hpp"
+    "${cudax_SOURCE_DIR}/include/cuda/experimental/*.cuh"
   )
   add_custom_target(cudax.all SOURCES ${all_sources})
 
diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh
index 738a5d6244..1263ea880f 100644
--- a/cudax/include/cuda/experimental/__detail/utility.cuh
+++ b/cudax/include/cuda/experimental/__detail/utility.cuh
@@ -25,12 +25,23 @@ namespace cuda::experimental
 {
 namespace detail
 {
-struct __ignore
+// This is a helper type that can be used to ignore function arguments.
+struct [[maybe_unused]] __ignore
 {
-  template <class... Args>
-  _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept
+  __ignore() = default;
+
+  template <class _Arg>
+  _CCCL_HOST_DEVICE constexpr __ignore(_Arg&&) noexcept
   {}
 };
+
+// Classes can inherit from this type to become immovable.
+struct __immovable
+{
+  __immovable() = default;
+  __immovable(__immovable&&) = delete;
+  __immovable& operator=(__immovable&&) = delete;
+};
 } // namespace detail
 
 struct uninit_t
@@ -38,7 +49,7 @@
   explicit uninit_t() = default;
 };
 
-inline constexpr uninit_t uninit{};
+_CCCL_GLOBAL_CONSTANT uninit_t uninit{};
 } // namespace cuda::experimental
 
 #endif // __CUDAX_DETAIL_UTILITY_H
diff --git a/cudax/include/cuda/experimental/__launch/param_kind.cuh b/cudax/include/cuda/experimental/__launch/param_kind.cuh
new file mode 100644
index 0000000000..d50ebe49d3
--- /dev/null
+++ b/cudax/include/cuda/experimental/__launch/param_kind.cuh
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__LAUNCH_PARAM_KIND
+#define _CUDAX__LAUNCH_PARAM_KIND
+
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+
+#include
+
+namespace cuda::experimental
+{
+namespace detail
+{
+enum class __param_kind : unsigned
+{
+  _in = 1,
+  _out = 2,
+  _inout = 3
+};
+
+_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept
+{
+  return __param_kind(unsigned(__a) & unsigned(__b));
+}
+
+template <class _Ty, __param_kind _Kind>
+struct _CCCL_NODISCARD __box
+{
+  ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val;
+};
+
+struct __in_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __out_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __inout_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+} // namespace detail
+
+_CCCL_GLOBAL_CONSTANT detail::__in_t in{};
+_CCCL_GLOBAL_CONSTANT detail::__out_t out{};
+_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{};
+
+} // namespace cuda::experimental
+
+#endif // _CUDAX__LAUNCH_PARAM_KIND
diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh
index 69048248ef..0bac26aa01 100644
--- a/cudax/include/cuda/experimental/launch.cuh
+++ b/cudax/include/cuda/experimental/launch.cuh
@@ -11,6 +11,9 @@
 #ifndef __CUDAX_LAUNCH___
 #define __CUDAX_LAUNCH___
 
+#include
 #include
+#include
+#include <cuda/experimental/__launch/param_kind.cuh>
 
 #endif // __CUDAX_LAUNCH___
diff --git a/cudax/samples/CMakeLists.txt b/cudax/samples/CMakeLists.txt
new file mode 100755
index 0000000000..df0985c1ad
--- /dev/null
+++ b/cudax/samples/CMakeLists.txt
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(CUDAX_SAMPLES CUDA CXX)
+
+# This example uses the CMake Package Manager (CPM) to simplify fetching CCCL from GitHub
+# For more information, see https://github.com/cpm-cmake/CPM.cmake
+include(cmake/CPM.cmake)
+
+# We define these as variables so they can be overridden in CI to pull from a PR instead of CCCL `main`
+# In your project, these variables are unnecessary and you can just use the values directly
+set(CCCL_REPOSITORY "nvidia/cccl" CACHE STRING "GitHub repository to fetch CCCL from")
+set(CCCL_TAG "main" CACHE STRING "Git tag/branch to fetch from CCCL repository")
+
+# This will automatically clone CCCL from GitHub and make the exported cmake targets available
+CPMAddPackage(
+  NAME CCCL
+  GITHUB_REPOSITORY ${CCCL_REPOSITORY}
+  GIT_TAG ${CCCL_TAG}
+  GIT_SHALLOW ON
+  OPTIONS "CCCL_ENABLE_UNSTABLE ON"
+)
+
+# Default to building for the GPU on the current system
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 86)
+endif()
+
+# Creates a cmake executable target for the main program
+add_executable(vector_add vector_add/vector_add.cu)
+
+# "Links" the CCCL::cudax CMake target to the `vector_add` executable. This
+# configures everything needed to use CCCL's headers, including setting up
+# include paths, compiler flags, etc.
+target_link_libraries(vector_add
+  PUBLIC
+    CCCL::cudax
+    CCCL::CCCL
+    CCCL::Thrust
+    CCCL::libcudacxx
+  INTERFACE cudax.compiler_interface
+)
+
+# TODO: These are temporary until the main branch catches up with the latest changes
+target_compile_definitions(vector_add PUBLIC LIBCUDACXX_ENABLE_EXCEPTIONS)
+
+if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+  # mdspan on windows only works in C++20 mode
+  target_compile_features(vector_add PUBLIC cxx_std_20)
+
+  # cudax requires dim3 to be usable from a constexpr context, and the CUDART headers require
+  # __cplusplus to be defined for this to work:
+  target_compile_options(vector_add PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus /Zc:preprocessor>
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:__cplusplus -Xcompiler=/Zc:preprocessor>
+  )
+endif()
+
+# This is only relevant for internal testing and not needed by end users.
+include(CTest)
+enable_testing()
+add_test(NAME vector_add COMMAND vector_add)
diff --git a/cudax/samples/cmake/CPM.cmake b/cudax/samples/cmake/CPM.cmake
new file mode 100755
index 0000000000..a3086b791b
--- /dev/null
+++ b/cudax/samples/cmake/CPM.cmake
@@ -0,0 +1,33 @@
+set(CPM_DOWNLOAD_VERSION 0.38.1)
+
+if(CPM_SOURCE_CACHE)
+  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+elseif(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+else()
+  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+endif()
+
+# Expand relative path. This is important if the provided path contains a tilde (~)
+get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
+
+function(download_cpm)
+  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
+  file(DOWNLOAD
+       https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+       ${CPM_DOWNLOAD_LOCATION}
+  )
+endfunction()
+
+if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
+  download_cpm()
+else()
+  # resume download if it previously failed
+  file(READ ${CPM_DOWNLOAD_LOCATION} check)
+  if("${check}" STREQUAL "")
+    download_cpm()
+  endif()
+  unset(check)
+endif()
+
+include(${CPM_DOWNLOAD_LOCATION})
diff --git a/cudax/samples/vector_add/param_kind.cuh b/cudax/samples/vector_add/param_kind.cuh
new file mode 100644
index 0000000000..d50ebe49d3
--- /dev/null
+++ b/cudax/samples/vector_add/param_kind.cuh
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__LAUNCH_PARAM_KIND
+#define _CUDAX__LAUNCH_PARAM_KIND
+
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+
+#include
+
+namespace cuda::experimental
+{
+namespace detail
+{
+enum class __param_kind : unsigned
+{
+  _in = 1,
+  _out = 2,
+  _inout = 3
+};
+
+_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept
+{
+  return __param_kind(unsigned(__a) & unsigned(__b));
+}
+
+template <class _Ty, __param_kind _Kind>
+struct _CCCL_NODISCARD __box
+{
+  ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val;
+};
+
+struct __in_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __out_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __inout_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+} // namespace detail
+
+_CCCL_GLOBAL_CONSTANT detail::__in_t in{};
+_CCCL_GLOBAL_CONSTANT detail::__out_t out{};
+_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{};
+
+} // namespace cuda::experimental
+
+#endif // _CUDAX__LAUNCH_PARAM_KIND
diff --git a/cudax/samples/vector_add/vector.cuh b/cudax/samples/vector_add/vector.cuh
new file mode 100644
index 0000000000..7eef87f038
--- /dev/null
+++ b/cudax/samples/vector_add/vector.cuh
@@ -0,0 +1,151 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__CONTAINER_VECTOR
+#define _CUDAX__CONTAINER_VECTOR
+
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "param_kind.cuh"
+
+#if _CCCL_STD_VER >= 2017
+namespace cuda::experimental
+{
+using ::cuda::std::span;
+using ::thrust::device_vector;
+using ::thrust::host_vector;
+
+template <class _Ty>
+class vector
+{
+public:
+  vector() = default;
+  explicit vector(size_t __n)
+      : __h_(__n)
+  {}
+
+  _Ty& operator[](size_t __i) noexcept
+  {
+    __dirty_ = true;
+    return __h_[__i];
+  }
+
+  const _Ty& operator[](size_t __i) const noexcept
+  {
+    return __h_[__i];
+  }
+
+private:
+  void sync_host_to_device(stream_ref __str, detail::__param_kind __p) const
+  {
+    if (__dirty_)
+    {
+      if (__p == detail::__param_kind::_out)
+      {
+        // There's no need to copy the data from host to device if the data is
+        // only going to be written to. We can just allocate the device memory.
+        __d_.resize(__h_.size());
+      }
+      else
+      {
+        // TODO: use a memcpy async here
+        __d_ = __h_;
+      }
+      __dirty_ = false;
+    }
+  }
+
+  void sync_device_to_host(stream_ref __str, detail::__param_kind __p) const
+  {
+    if (__p != detail::__param_kind::_in)
+    {
+      // TODO: use a memcpy async here
+      __str.wait(); // wait for the kernel to finish executing
+      __h_ = __d_;
+    }
+  }
+
+  template <detail::__param_kind _Kind>
+  class __action //: private detail::__immovable
+  {
+    using __cv_vector = ::cuda::std::__maybe_const<_Kind == detail::__param_kind::_in, vector>;
+
+  public:
+    explicit __action(stream_ref __str, __cv_vector& __v) noexcept
+        : __str_(__str)
+        , __v_(__v)
+    {
+      __v_.sync_host_to_device(__str_, _Kind);
+    }
+
+    __action(__action&&) = delete;
+
+    ~__action()
+    {
+      __v_.sync_device_to_host(__str_, _Kind);
+    }
+
+    using __as_kernel_arg = ::cuda::std::span<_Ty>;
+
+    operator ::cuda::std::span<_Ty>()
+    {
+      return {__v_.__d_.data().get(), __v_.__d_.size()};
+    }
+
+  private:
+    stream_ref __str_;
+    __cv_vector& __v_;
+  };
+
+  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_inout>
+  __cudax_launch_transform(stream_ref __str, vector& __v) noexcept
+  {
+    return __action<detail::__param_kind::_inout>{__str, __v};
+  }
+
+  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_in>
+  __cudax_launch_transform(stream_ref __str, const vector& __v) noexcept
+  {
+    return __action<detail::__param_kind::_in>{__str, __v};
+  }
+
+  template <detail::__param_kind _Kind>
+  _CCCL_NODISCARD_FRIEND __action<_Kind>
+  __cudax_launch_transform(stream_ref __str, detail::__box<_Ty, _Kind> __b) noexcept
+  {
+    return __action<_Kind>{__str, __b.__val};
+  }
+
+  mutable host_vector<_Ty> __h_;
+  mutable device_vector<_Ty> __d_{};
+  mutable bool __dirty_ = true;
+};
+
+} // namespace cuda::experimental
+
+#endif
+#endif
diff --git a/cudax/samples/vector_add/vector_add.cu b/cudax/samples/vector_add/vector_add.cu
new file mode 100644
index 0000000000..784997e23d
--- /dev/null
+++ b/cudax/samples/vector_add/vector_add.cu
@@ -0,0 +1,127 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Vector addition: C = A + B.
+ *
+ * This sample is a very basic sample that implements element by element
+ * vector addition. It is the same as the sample illustrating Chapter 2
+ * of the programming guide with some additions like error checking.
+ */
+
+#include <stdio.h>
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+#include
+
+#include
+#include
+
+#include "vector.cuh"
+
+namespace cudax = cuda::experimental;
+using cudax::in;
+using cudax::out;
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void vectorAdd(cudax::span<const float> A, cudax::span<const float> B, cudax::span<float> C)
+{
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (i < A.size())
+  {
+    C[i] = A[i] + B[i] + 0.0f;
+  }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+try
+{
+  // A CUDA stream on which to execute the vector addition kernel
+  cudax::stream stream(cudax::devices[0]);
+
+  // Print the vector length to be used, and compute its size
+  int numElements = 50000;
+  printf("[Vector addition of %d elements]\n", numElements);
+
+  // Allocate the host vectors
+  cudax::vector<float> A(numElements); // input
+  cudax::vector<float> B(numElements); // input
+  cudax::vector<float> C(numElements); // output
+
+  // Initialize the host input vectors
+  for (int i = 0; i < numElements; ++i)
+  {
+    A[i] = rand() / (float) RAND_MAX;
+    B[i] = rand() / (float) RAND_MAX;
+  }
+
+  // Define the kernel launch parameters
+  constexpr int threadsPerBlock = 256;
+  auto dims = cudax::distribute<threadsPerBlock>(numElements);
+
+  // Launch the vectorAdd kernel
+  printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock);
+  cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C));
+
+  printf("waiting for the stream to finish\n");
+  stream.wait();
+
+  printf("verifying the results\n");
+  // Verify that the result vector is correct
+  for (int i = 0; i < numElements; ++i)
+  {
+    if (fabs(A[i] + B[i] - C[i]) > 1e-5)
+    {
+      fprintf(stderr, "Result verification failed at element %d!\n", i);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  printf("Test PASSED\n");
+
+  printf("Done\n");
+  return 0;
+}
+catch (const std::exception& e)
+{
+  printf("caught an exception: \"%s\"\n", e.what());
+}
+catch (...)
+{
+  printf("caught an unknown exception\n");
+}
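The sketch below is a minimal, condensed illustration of the pattern this patch introduces: a cudax::vector owns a host/device pair, the in()/out() adaptors from param_kind.cuh tell cudax::launch (via the __cudax_launch_transform hook in vector.cuh) which host/device copies to perform around the launch, and the kernel itself only ever sees cuda::std::span arguments. It only reuses names that appear in the patch; the kernel name `scale`, the sizes, and the variable names are invented for the example, and the block size of 256 mirrors the sample's threadsPerBlock.

// Minimal usage sketch (assumes the headers added by this patch are on the include path).
#include <cuda/experimental/launch.cuh>

#include "vector.cuh"

namespace cudax = cuda::experimental;

// The kernel receives plain spans; in()/out() below control the host<->device copies.
__global__ void scale(cudax::span<const float> src, cudax::span<float> dst)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < src.size())
  {
    dst[i] = 2.0f * src[i];
  }
}

int main()
{
  cudax::stream stream(cudax::devices[0]); // stream on the first visible device

  cudax::vector<float> src(1024), dst(1024); // host-backed vectors with device mirrors
  for (int i = 0; i < 1024; ++i)
  {
    src[i] = float(i); // writing through operator[] marks the host copy dirty
  }

  auto dims = cudax::distribute<256>(1024); // 256 threads per block, as in the sample

  // in(src): copy host data to the device before the launch; out(dst): allocate device
  // storage only, then copy the result back to the host when the launch argument is
  // destroyed (see vector::__action above).
  cudax::launch(stream, dims, scale, cudax::in(src), cudax::out(dst));
  stream.wait();

  return dst[512] == 1024.0f ? 0 : 1;
}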