From a3a5f9c227c63c8f328ed2c181f0e935cc713eda Mon Sep 17 00:00:00 2001
From: Eric Niebler
Date: Fri, 9 Aug 2024 10:46:24 -0700
Subject: [PATCH] CUDA `vector_add` sample project (#2160)

---------

Co-authored-by: pciolkosz
Co-authored-by: Michael Schellenberger Costa
---
 cudax/CMakeLists.txt                          |  13 +-
 cudax/cmake/cudaxBuildCompilerTargets.cmake   |   2 +-
 cudax/cmake/cudaxBuildTargetList.cmake        |   1 +
 .../cuda/experimental/__detail/utility.cuh    |  19 ++-
 .../cuda/experimental/__launch/param_kind.cuh |  85 ++++++++++
 cudax/include/cuda/experimental/launch.cuh    |   3 +
 cudax/samples/CMakeLists.txt                  |  76 +++++++++
 cudax/samples/cmake/CPM.cmake                 |  33 ++++
 cudax/samples/vector_add/param_kind.cuh       |  85 ++++++++++
 cudax/samples/vector_add/vector.cuh           | 151 ++++++++++++++++++
 cudax/samples/vector_add/vector_add.cu        | 127 +++++++++++++++
 11 files changed, 589 insertions(+), 6 deletions(-)
 create mode 100644 cudax/include/cuda/experimental/__launch/param_kind.cuh
 create mode 100755 cudax/samples/CMakeLists.txt
 create mode 100755 cudax/samples/cmake/CPM.cmake
 create mode 100644 cudax/samples/vector_add/param_kind.cuh
 create mode 100644 cudax/samples/vector_add/vector.cuh
 create mode 100644 cudax/samples/vector_add/vector_add.cu

diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt
index 4886562aca..f875cf8ebf 100644
--- a/cudax/CMakeLists.txt
+++ b/cudax/CMakeLists.txt
@@ -11,7 +11,7 @@ if (cudax_TOPLEVEL_PROJECT)
   cmake_minimum_required(VERSION 3.21)
 endif()
 
-project(cudax LANGUAGES CUDA)
+project(cudax LANGUAGES CUDA CXX)
 
 option(cudax_ENABLE_INSTALL_RULES "Enable installation of CUDA Experimental." ${cudax_TOPLEVEL_PROJECT})
 if (cudax_ENABLE_INSTALL_RULES)
@@ -25,6 +25,7 @@ endif()
 
 option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON)
 option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
+option(cudax_ENABLE_SAMPLES "Build CUDA Experimental's samples." ON)
 
 include(cmake/cudaxBuildCompilerTargets.cmake)
 include(cmake/cudaxBuildTargetList.cmake)
@@ -41,3 +42,13 @@ if (cudax_ENABLE_TESTING)
   enable_testing() # Must be in root directory
   add_subdirectory(test)
 endif()
+
+if (cudax_ENABLE_SAMPLES)
+  include(ExternalProject)
+  ExternalProject_Add(cudax_samples
+    PREFIX samples
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/samples"
+    BUILD_ALWAYS ON
+    INSTALL_COMMAND cmake -E echo "Skipping install step.")
+  add_dependencies(cudax.all cudax_samples)
+endif()
diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake
index 73aa9e376e..53cf7b8af4 100644
--- a/cudax/cmake/cudaxBuildCompilerTargets.cmake
+++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake
@@ -9,7 +9,7 @@
 include("${cudax_SOURCE_DIR}/cmake/AppendOptionIfAvailable.cmake")
 
 function(cudax_build_compiler_targets)
-  set(cxx_compile_definitions)
+  set(cxx_compile_definitions LIBCUDACXX_ENABLE_EXCEPTIONS)
   set(cxx_compile_options)
   set(cuda_compile_options)
 
diff --git a/cudax/cmake/cudaxBuildTargetList.cmake b/cudax/cmake/cudaxBuildTargetList.cmake
index 63284dbe4a..2be17393dc 100644
--- a/cudax/cmake/cudaxBuildTargetList.cmake
+++ b/cudax/cmake/cudaxBuildTargetList.cmake
@@ -176,6 +176,7 @@ function(cudax_build_target_list)
   file(GLOB_RECURSE all_sources
     RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
     "${cudax_SOURCE_DIR}/include/cuda/experimental/*.hpp"
+    "${cudax_SOURCE_DIR}/include/cuda/experimental/*.cuh"
   )
   add_custom_target(cudax.all SOURCES ${all_sources})
 
diff --git a/cudax/include/cuda/experimental/__detail/utility.cuh b/cudax/include/cuda/experimental/__detail/utility.cuh
index 738a5d6244..1263ea880f 100644
--- a/cudax/include/cuda/experimental/__detail/utility.cuh
+++ b/cudax/include/cuda/experimental/__detail/utility.cuh
@@ -25,12 +25,23 @@ namespace cuda::experimental
 {
 namespace detail
 {
-struct __ignore
+// This is a helper type that can be used to ignore function arguments.
+struct [[maybe_unused]] __ignore
 {
-  template <class... Args>
-  _CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept
+  __ignore() = default;
+
+  template <class _Arg>
+  _CCCL_HOST_DEVICE constexpr __ignore(_Arg&&) noexcept
   {}
 };
+
+// Classes can inherit from this type to become immovable.
+struct __immovable
+{
+  __immovable() = default;
+  __immovable(__immovable&&) = delete;
+  __immovable& operator=(__immovable&&) = delete;
+};
 } // namespace detail
 
 struct uninit_t
@@ -38,7 +49,7 @@
   explicit uninit_t() = default;
 };
 
-inline constexpr uninit_t uninit{};
+_CCCL_GLOBAL_CONSTANT uninit_t uninit{};
 } // namespace cuda::experimental
 
 #endif // __CUDAX_DETAIL_UTILITY_H
diff --git a/cudax/include/cuda/experimental/__launch/param_kind.cuh b/cudax/include/cuda/experimental/__launch/param_kind.cuh
new file mode 100644
index 0000000000..d50ebe49d3
--- /dev/null
+++ b/cudax/include/cuda/experimental/__launch/param_kind.cuh
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__LAUNCH_PARAM_KIND
+#define _CUDAX__LAUNCH_PARAM_KIND
+
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+
+#include
+
+namespace cuda::experimental
+{
+namespace detail
+{
+enum class __param_kind : unsigned
+{
+  _in = 1,
+  _out = 2,
+  _inout = 3
+};
+
+_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept
+{
+  return __param_kind(unsigned(__a) & unsigned(__b));
+}
+
+template <class _Ty, __param_kind _Kind>
+struct _CCCL_NODISCARD __box
+{
+  ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val;
+};
+
+struct __in_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __out_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __inout_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+} // namespace detail
+
+_CCCL_GLOBAL_CONSTANT detail::__in_t in{};
+_CCCL_GLOBAL_CONSTANT detail::__out_t out{};
+_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{};
+
+} // namespace cuda::experimental
+
+#endif // _CUDAX__LAUNCH_PARAM_KIND
diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh
index 69048248ef..0bac26aa01 100644
--- a/cudax/include/cuda/experimental/launch.cuh
+++ b/cudax/include/cuda/experimental/launch.cuh
@@ -11,6 +11,9 @@
 #ifndef __CUDAX_LAUNCH___
 #define __CUDAX_LAUNCH___
 
+#include
 #include
+#include
+#include <cuda/experimental/__launch/param_kind.cuh>
 
 #endif // __CUDAX_LAUNCH___
diff --git a/cudax/samples/CMakeLists.txt b/cudax/samples/CMakeLists.txt
new file mode 100755
index 0000000000..df0985c1ad
--- /dev/null
+++ b/cudax/samples/CMakeLists.txt
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(CUDAX_SAMPLES CUDA CXX)
+
+# This example uses the CMake Package Manager (CPM) to simplify fetching CCCL from GitHub
+# For more information, see https://github.com/cpm-cmake/CPM.cmake
+include(cmake/CPM.cmake)
+
+# We define these as variables so they can be overridden in CI to pull from a PR instead of CCCL `main`
+# In your project, these variables are unnecessary and you can just use the values directly
+set(CCCL_REPOSITORY "nvidia/cccl" CACHE STRING "GitHub repository to fetch CCCL from")
+set(CCCL_TAG "main" CACHE STRING "Git tag/branch to fetch from CCCL repository")
+
+# This will automatically clone CCCL from GitHub and make the exported cmake targets available
+CPMAddPackage(
+  NAME CCCL
+  GITHUB_REPOSITORY ${CCCL_REPOSITORY}
+  GIT_TAG ${CCCL_TAG}
+  GIT_SHALLOW ON
+  OPTIONS "CCCL_ENABLE_UNSTABLE ON"
+)
+
+# Default to building for the GPU on the current system
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 86)
+endif()
+
+# Creates a cmake executable target for the main program
+add_executable(vector_add vector_add/vector_add.cu)
+
+# "Links" the CCCL::cudax CMake target to the `vector_add` executable. This
+# configures everything needed to use CCCL's headers, including setting up
+# include paths, compiler flags, etc.
+target_link_libraries(vector_add
+  PUBLIC
+    CCCL::cudax
+    CCCL::CCCL
+    CCCL::Thrust
+    CCCL::libcudacxx
+  INTERFACE cudax.compiler_interface
+)
+
+# TODO: These are temporary until the main branch catches up with the latest changes
+target_compile_definitions(vector_add PUBLIC LIBCUDACXX_ENABLE_EXCEPTIONS)
+
+if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+  # mdspan on windows only works in C++20 mode
+  target_compile_features(vector_add PUBLIC cxx_std_20)
+
+  # cudax requires dim3 to be usable from a constexpr context, and the CUDART headers require
+  # __cplusplus to be defined for this to work:
+  target_compile_options(vector_add PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus /Zc:preprocessor>
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:__cplusplus -Xcompiler=/Zc:preprocessor>
+  )
+endif()
+
+# This is only relevant for internal testing and not needed by end users.
+include(CTest)
+enable_testing()
+add_test(NAME vector_add COMMAND vector_add)
diff --git a/cudax/samples/cmake/CPM.cmake b/cudax/samples/cmake/CPM.cmake
new file mode 100755
index 0000000000..a3086b791b
--- /dev/null
+++ b/cudax/samples/cmake/CPM.cmake
@@ -0,0 +1,33 @@
+set(CPM_DOWNLOAD_VERSION 0.38.1)
+
+if(CPM_SOURCE_CACHE)
+  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+elseif(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+else()
+  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+endif()
+
+# Expand relative path. This is important if the provided path contains a tilde (~)
+get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
+
+function(download_cpm)
+  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
+  file(DOWNLOAD
+       https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+       ${CPM_DOWNLOAD_LOCATION}
+  )
+endfunction()
+
+if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
+  download_cpm()
+else()
+  # resume download if it previously failed
+  file(READ ${CPM_DOWNLOAD_LOCATION} check)
+  if("${check}" STREQUAL "")
+    download_cpm()
+  endif()
+  unset(check)
+endif()
+
+include(${CPM_DOWNLOAD_LOCATION})
diff --git a/cudax/samples/vector_add/param_kind.cuh b/cudax/samples/vector_add/param_kind.cuh
new file mode 100644
index 0000000000..d50ebe49d3
--- /dev/null
+++ b/cudax/samples/vector_add/param_kind.cuh
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__LAUNCH_PARAM_KIND
+#define _CUDAX__LAUNCH_PARAM_KIND
+
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+
+#include
+
+namespace cuda::experimental
+{
+namespace detail
+{
+enum class __param_kind : unsigned
+{
+  _in = 1,
+  _out = 2,
+  _inout = 3
+};
+
+_CCCL_NODISCARD _CCCL_HOST_DEVICE inline constexpr __param_kind operator&(__param_kind __a, __param_kind __b) noexcept
+{
+  return __param_kind(unsigned(__a) & unsigned(__b));
+}
+
+template <class _Ty, __param_kind _Kind>
+struct _CCCL_NODISCARD __box
+{
+  ::cuda::std::__maybe_const<_Kind == __param_kind::_in, _Ty>& __val;
+};
+
+struct __in_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_in> operator()(const _Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __out_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_out> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+struct __inout_t
+{
+  template <class _Ty>
+  __box<_Ty, __param_kind::_inout> operator()(_Ty& __v) const noexcept
+  {
+    return {__v};
+  }
+};
+
+} // namespace detail
+
+_CCCL_GLOBAL_CONSTANT detail::__in_t in{};
+_CCCL_GLOBAL_CONSTANT detail::__out_t out{};
+_CCCL_GLOBAL_CONSTANT detail::__inout_t inout{};
+
+} // namespace cuda::experimental
+
+#endif // _CUDAX__LAUNCH_PARAM_KIND
diff --git a/cudax/samples/vector_add/vector.cuh b/cudax/samples/vector_add/vector.cuh
new file mode 100644
index 0000000000..7eef87f038
--- /dev/null
+++ b/cudax/samples/vector_add/vector.cuh
@@ -0,0 +1,151 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__CONTAINER_VECTOR
+#define _CUDAX__CONTAINER_VECTOR
+
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "param_kind.cuh"
+
+#if _CCCL_STD_VER >= 2017
+namespace cuda::experimental
+{
+using ::cuda::std::span;
+using ::thrust::device_vector;
+using ::thrust::host_vector;
+
+template <class _Ty>
+class vector
+{
+public:
+  vector() = default;
+  explicit vector(size_t __n)
+      : __h_(__n)
+  {}
+
+  _Ty& operator[](size_t __i) noexcept
+  {
+    __dirty_ = true;
+    return __h_[__i];
+  }
+
+  const _Ty& operator[](size_t __i) const noexcept
+  {
+    return __h_[__i];
+  }
+
+private:
+  void sync_host_to_device(stream_ref __str, detail::__param_kind __p) const
+  {
+    if (__dirty_)
+    {
+      if (__p == detail::__param_kind::_out)
+      {
+        // There's no need to copy the data from host to device if the data is
+        // only going to be written to. We can just allocate the device memory.
+        __d_.resize(__h_.size());
+      }
+      else
+      {
+        // TODO: use a memcpy async here
+        __d_ = __h_;
+      }
+      __dirty_ = false;
+    }
+  }
+
+  void sync_device_to_host(stream_ref __str, detail::__param_kind __p) const
+  {
+    if (__p != detail::__param_kind::_in)
+    {
+      // TODO: use a memcpy async here
+      __str.wait(); // wait for the kernel to finish executing
+      __h_ = __d_;
+    }
+  }
+
+  template <detail::__param_kind _Kind>
+  class __action //: private detail::__immovable
+  {
+    using __cv_vector = ::cuda::std::__maybe_const<_Kind == detail::__param_kind::_in, vector>;
+
+  public:
+    explicit __action(stream_ref __str, __cv_vector& __v) noexcept
+        : __str_(__str)
+        , __v_(__v)
+    {
+      __v_.sync_host_to_device(__str_, _Kind);
+    }
+
+    __action(__action&&) = delete;
+
+    ~__action()
+    {
+      __v_.sync_device_to_host(__str_, _Kind);
+    }
+
+    using __as_kernel_arg = ::cuda::std::span<_Ty>;
+
+    operator ::cuda::std::span<_Ty>()
+    {
+      return {__v_.__d_.data().get(), __v_.__d_.size()};
+    }
+
+  private:
+    stream_ref __str_;
+    __cv_vector& __v_;
+  };
+
+  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_inout>
+  __cudax_launch_transform(stream_ref __str, vector& __v) noexcept
+  {
+    return __action<detail::__param_kind::_inout>{__str, __v};
+  }
+
+  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_in>
+  __cudax_launch_transform(stream_ref __str, const vector& __v) noexcept
+  {
+    return __action<detail::__param_kind::_in>{__str, __v};
+  }
+
+  template <detail::__param_kind _Kind>
+  _CCCL_NODISCARD_FRIEND __action<_Kind>
+  __cudax_launch_transform(stream_ref __str, detail::__box<_Ty, _Kind> __b) noexcept
+  {
+    return __action<_Kind>{__str, __b.__val};
+  }
+
+  mutable host_vector<_Ty> __h_;
+  mutable device_vector<_Ty> __d_{};
+  mutable bool __dirty_ = true;
+};
+
+} // namespace cuda::experimental
+
+#endif
+#endif
diff --git a/cudax/samples/vector_add/vector_add.cu b/cudax/samples/vector_add/vector_add.cu
new file mode 100644
index 0000000000..784997e23d
--- /dev/null
+++ b/cudax/samples/vector_add/vector_add.cu
@@ -0,0 +1,127 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Vector addition: C = A + B.
+ *
+ * This sample is a very basic sample that implements element by element
+ * vector addition. It is the same as the sample illustrating Chapter 2
+ * of the programming guide with some additions like error checking.
+ */
+
+#include <stdio.h>
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+#include
+
+#include
+#include
+
+#include "vector.cuh"
+
+namespace cudax = cuda::experimental;
+using cudax::in;
+using cudax::out;
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void vectorAdd(cudax::span<const float> A, cudax::span<const float> B, cudax::span<float> C)
+{
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (i < A.size())
+  {
+    C[i] = A[i] + B[i] + 0.0f;
+  }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+try
+{
+  // A CUDA stream on which to execute the vector addition kernel
+  cudax::stream stream(cudax::devices[0]);
+
+  // Print the vector length to be used, and compute its size
+  int numElements = 50000;
+  printf("[Vector addition of %d elements]\n", numElements);
+
+  // Allocate the host vectors
+  cudax::vector<float> A(numElements); // input
+  cudax::vector<float> B(numElements); // input
+  cudax::vector<float> C(numElements); // output
+
+  // Initialize the host input vectors
+  for (int i = 0; i < numElements; ++i)
+  {
+    A[i] = rand() / (float) RAND_MAX;
+    B[i] = rand() / (float) RAND_MAX;
+  }
+
+  // Define the kernel launch parameters
+  constexpr int threadsPerBlock = 256;
+  auto dims = cudax::distribute<threadsPerBlock>(numElements);
+
+  // Launch the vectorAdd kernel
+  printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock);
+  cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C));
+
+  printf("waiting for the stream to finish\n");
+  stream.wait();
+
+  printf("verifying the results\n");
+  // Verify that the result vector is correct
+  for (int i = 0; i < numElements; ++i)
+  {
+    if (fabs(A[i] + B[i] - C[i]) > 1e-5)
+    {
+      fprintf(stderr, "Result verification failed at element %d!\n", i);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  printf("Test PASSED\n");
+
+  printf("Done\n");
+  return 0;
+}
+catch (const std::exception& e)
+{
+  printf("caught an exception: \"%s\"\n", e.what());
+}
+catch (...)
+{
+  printf("caught an unknown exception\n");
+}
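The sketch below is a minimal, condensed illustration of the pattern this patch introduces: a cudax::vector owns a host/device pair, the in()/out() adaptors from param_kind.cuh tell cudax::launch (via the __cudax_launch_transform hook in vector.cuh) which host/device copies to perform around the launch, and the kernel itself only ever sees cuda::std::span arguments. It only reuses names that appear in the patch; the kernel name `scale`, the sizes, and the variable names are invented for the example, and the block size of 256 mirrors the sample's threadsPerBlock.

// Minimal usage sketch (assumes the headers added by this patch are on the include path).
#include <cuda/experimental/launch.cuh>

#include "vector.cuh"

namespace cudax = cuda::experimental;

// The kernel receives plain spans; in()/out() below control the host<->device copies.
__global__ void scale(cudax::span<const float> src, cudax::span<float> dst)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < src.size())
  {
    dst[i] = 2.0f * src[i];
  }
}

int main()
{
  cudax::stream stream(cudax::devices[0]); // stream on the first visible device

  cudax::vector<float> src(1024), dst(1024); // host-backed vectors with device mirrors
  for (int i = 0; i < 1024; ++i)
  {
    src[i] = float(i); // writing through operator[] marks the host copy dirty
  }

  auto dims = cudax::distribute<256>(1024); // 256 threads per block, as in the sample

  // in(src): copy host data to the device before the launch; out(dst): allocate device
  // storage only, then copy the result back to the host when the launch argument is
  // destroyed (see vector::__action above).
  cudax::launch(stream, dims, scale, cudax::in(src), cudax::out(dst));
  stream.wait();

  return dst[512] == 1024.0f ? 0 : 1;
}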